From 0687aa1687478111055367d4f18d84cb51611b84 Mon Sep 17 00:00:00 2001 From: Mohamed Jinas Date: Tue, 11 Aug 2020 04:25:56 +0500 Subject: [PATCH] Sun Scraper and service fix --- app/Services/Scrapers/SunScraper.php | 18 +++++++++++------- app/Services/SunService.php | 27 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/app/Services/Scrapers/SunScraper.php b/app/Services/Scrapers/SunScraper.php index 541dfb1..e0834ac 100644 --- a/app/Services/Scrapers/SunScraper.php +++ b/app/Services/Scrapers/SunScraper.php @@ -11,7 +11,7 @@ class SunScraper protected $title; protected $content; protected $topics = []; - protected $author; + protected $author = "unknown"; public function __construct() { @@ -23,25 +23,29 @@ class SunScraper $crawler = $this->client->request('GET', $url); - $crawler->filter('h1')->each(function ($node) { + $crawler->filter('.component-article-title h1')->each(function ($node) { $this->title = $node->text(); }); $crawler->filter('.component-article-content p')->each(function ($node) { - $this->content[] = preg_replace("/[a-zA-Z]/","",$node->text()); + $this->content[] = preg_replace("/[a-zA-Z]/", "", $node->text()); }); $crawler->filter('.component-article-tag li a')->each(function ($node) { - + $this->topics[] = [ "name" => $node->first()->text(), "slug" => str_replace("https://sun.mv/", "", $node->first()->attr('href')) ]; }); + if($crawler->filter(".author .name")->count() == 1) + { + $this->author = $crawler->filter(".author .name")->first()->text(); + } //Remove all the alphabets from string //preg_replace("/[a-zA-Z]/", "",$string); - return [ + return [ 'source' => 'Sun', 'title' => $this->title, 'og_title' => $crawler->filter('meta[property*="og:title"]')->first()->attr('content'), @@ -49,8 +53,8 @@ class SunScraper 'content' => $this->content, 'url' => $url, 'date' => $crawler->filter(".author .time")->first()->text(), - 'guid' => str_replace("https://sun.mv/","",$url), - 'author' => 
$crawler->filter(".author .name")->first()->text(), + 'guid' => str_replace("https://sun.mv/", "", $url), + 'author' => $this->author, 'topics' => collect($this->topics)->unique()->values()->toArray() ]; } diff --git a/app/Services/SunService.php b/app/Services/SunService.php index e69de29..c40887f 100644 --- a/app/Services/SunService.php +++ b/app/Services/SunService.php @@ -0,0 +1,27 @@ +get(); + + $articlesitems = []; + //Loop through the articles and scrape each one, creating a new scraper instance per article. + foreach ($articles as $article) { + $articlesitems[] = (new SunScraper)->extract("https://sun.mv/".$article["id"]); + } + + return $articlesitems; + } +} \ No newline at end of file