diff --git a/app/Console/Commands/ScrapeDhiyaresCommand.php b/app/Console/Commands/ScrapeDhiyaresCommand.php index 6f597f4..9e4a5ea 100644 --- a/app/Console/Commands/ScrapeDhiyaresCommand.php +++ b/app/Console/Commands/ScrapeDhiyaresCommand.php @@ -45,6 +45,7 @@ class ScrapeDhiyaresCommand extends Command $articles = (new DhiyaresService)->scrape(); + foreach ($articles as $article) { // Attach the relationship between source and article and return the curren article instance diff --git a/app/Services/DhiyaresService.php b/app/Services/DhiyaresService.php index 355a5cf..794b410 100644 --- a/app/Services/DhiyaresService.php +++ b/app/Services/DhiyaresService.php @@ -14,15 +14,16 @@ class DhiyaresService extends Client */ public function scrape(): array { - //Return only the rss that contains "news" keyboard in its url $articles = (new DhiyaresFeed)->get(); - - $articlesitems = []; - //Looping through the articles and scraping and while scraping it creates a new instance of the scraper. + + $articleItems = []; foreach ($articles as $article) { - $articlesitems[] = (new DhiyaresScraper)->extract($article["link"], $article["date"]); + $scrapedData = (new DhiyaresScraper)->extract($article["link"], $article["date"]); + if ($scrapedData !== null) { + $articleItems[] = $scrapedData; + } } - return $articlesitems; + return $articleItems; } } diff --git a/app/Services/Scrapers/DhiyaresScraper.php b/app/Services/Scrapers/DhiyaresScraper.php index 03ad971..5762600 100644 --- a/app/Services/Scrapers/DhiyaresScraper.php +++ b/app/Services/Scrapers/DhiyaresScraper.php @@ -13,6 +13,7 @@ class DhiyaresScraper protected $content; protected $topics = []; protected $author = "unknown"; + protected $image; public function __construct() { @@ -24,38 +25,50 @@ class DhiyaresScraper $crawler = $this->client->request('GET', $url); - $crawler->filter('.content p')->each(function ($node) { + $ogTitle = $crawler->filter('title')->first()->text(); + + if (strpos($ogTitle, 'Gallery') === 0) { 
+ return null; + } + + $crawler->filter('.content .block-text')->each(function ($node) { $this->content[] = $node->text(); }); + $crawler->filter('a[class*="bg-blue-50 border border-blue-50 font-normal text-blue-400 hover:text-white hover:bg-base p-1 px-4 mx-2 mb-4 rounded-full font-mv-bold text-base"]')->each(function ($node) { - if(!preg_match('/[^A-Za-z0-9-]/', basename($node->attr('href')))) - { - $this->topics[] = [ - "name" => $node->text(), - "slug" => basename($node->attr('href')) - ]; - } - + if (!preg_match('/[^A-Za-z0-9-]/', basename($node->attr('href')))) { + $this->topics[] = [ + "name" => $node->text(), + "slug" => basename($node->attr('href')) + ]; + } }); + if ($crawler->filter("figure img")->count() > 0) { + $this->image = $crawler->filter("figure img")->first()->attr('src'); + } else { + $this->image = ''; // or a default image path + } + if ($crawler->filter('div[class*="text-base font-mv-bold"] a[class*="py-4"]')->count() == 1) { $this->author = $crawler->filter('div[class*="text-base font-mv-bold"] a[class*="py-4"]')->first()->text(); } //Remove all the alphabets from string //preg_replace("/[a-zA-Z]/", "",$string); + return [ 'source' => 'Dhiyares', 'title' => $crawler->filter('h1')->first()->text(), 'og_title' => $crawler->filter('meta[property*="og:title"]')->first()->attr('content'), - 'image' => $crawler->filter("figure img")->first()->attr('src'), + 'image' => $this->image, 'content' => $this->content, 'url' => $url, 'date' => Carbon::parse($date)->format("Y-m-d H:i:s"), - 'guid' =>basename($url), + 'guid' => basename($url), 'author' => $this->author, - 'topics' => $this->topics ? : [ + 'topics' => $this->topics ?: [ [ "name" => "ވަކި މަޢުލޫއެއް ނޭންގެ", "slug" => "no-specific-topic"