Bug fixes and Thiladhun scraper integration

2020-08-11 03:39:08 +05:00
parent 0cede5b708
commit 5df4011f13
14 changed files with 4478 additions and 137 deletions
@@ -0,0 +1,75 @@
+<?php
+
+namespace App\Console\Commands;
+
+use Illuminate\Console\Command;
+use App\Source;
+use App\Services\ThiladhunService;
+use App\Topic;
+use Illuminate\Support\Carbon;
+
+class ScrapeThiladhunCommand extends Command
+{
+    /**
+     * The name and signature of the console command.
+     *
+     * @var string
+     */
+    protected $signature = 'scrape:thiladhun';
+
+    /**
+     * The console command description.
+     *
+     * @var string
+     */
+    protected $description = 'Scrape Thiladhun';
+
+    /**
+     * Create a new command instance.
+     *
+     * @return void
+     */
+    public function __construct()
+    {
+        parent::__construct();
+    }
+
+    /**
+     * Execute the console command.
+     *
+     * Looks up the source whose slug is "thiladhun", scrapes its articles
+     * through ThiladhunService and stores every article that is not yet
+     * attached to that source, linking each one to its topics.
+     *
+     * @return int 0 on success, 1 when the "thiladhun" source does not exist
+     */
+    public function handle()
+    {
+        $source = Source::where('slug', 'thiladhun')->first();
+
+        // Without the source row there is nothing to attach articles to;
+        // report it instead of failing on a null dereference below.
+        if ($source === null) {
+            $this->error('Source with slug "thiladhun" was not found.');
+
+            return 1;
+        }
+
+        $articles = (new ThiladhunService)->scrape();
+
+        foreach ($articles as $article) {
+            // Look the article up by its guid only; the remaining columns are
+            // used solely when a new row has to be created. Matching on every
+            // column (including the array-valued body and meta) would create a
+            // duplicate whenever any scraped text changes between runs.
+            $articleModel = $source->articles()->firstOrCreate(
+                [
+                    "guid" => $article["guid"],
+                ],
+                [
+                    "title" => $article["title"],
+                    "url" => $article["url"],
+                    "author" => $article["author"],
+                    "featured_image" => $article["image"],
+                    "body" => $article["content"],
+                    "published_date" => Carbon::parse($article["date"])->format("Y-m-d H:i:s"),
+                    "meta" => [
+                        "title" => $article["og_title"]
+                    ],
+                ]
+            );
+
+            collect($article["topics"])->each(function ($topic) use ($articleModel) {
+                $topicModel = Topic::firstOrCreate([
+                    "name" => $topic["name"],
+                    "slug" => $topic["slug"],
+                ]);
+
+                // Attach the article to the topic while keeping existing links.
+                $topicModel->articles()->syncWithoutDetaching($articleModel);
+            });
+        }
+
+        return 0;
+    }
+}
@@ -15,9 +15,8 @@ final class RecentArticles extends Controller
     */
    public function __invoke()
    {
-        return ArticleResource::collection(Article::with('source', 'topics')
+        return Article::with('source', 'topics')
                ->latest("published_date")
-                ->paginate(8)
-        );
+                ->paginate(8);
    }
 }
@@ -18,11 +18,11 @@ class TodaysPick extends Controller
     */
    public function __invoke()
    {
-        return ArticleResource::collection(Article::with('topics', 'source')
+       return Article::with('topics', 'source')
                        ->whereDate('published_date', Carbon::today())
                        ->inRandomOrder()
                        ->take(8)
                        ->get()
-                        ->unique('source.name'));
+                        ->unique('source.name')->values()->toArray();
    }
 }
@@ -11,7 +11,7 @@ class MihaaruScraper
    protected $title;
    protected $content;
    protected $image;
-    protected $tags = [];
+    protected $topics = [];
    protected $author;

    public function __construct()
@@ -25,13 +25,11 @@ class MihaaruScraper
        $crawler = $this->client->request('GET', $url);

        $crawler->filter('h1')->each(function ($node) {
-            $title = $node->text();
-            $this->title = $title;
+            $this->title = $node->text();
        });

        $crawler->filter('.container  img')->eq(3)->each(function ($node) {
-            $image = $node->attr('src');
-            $this->image = $image;
+            $this->image = $node->attr('src');
        });

        $crawler->filter('.by-line address')->each(function ($node) {
@@ -49,7 +47,7 @@ class MihaaruScraper

        $crawler->filter('.article-tags')->each(function ($node) {
           
-            $this->tags[] = [
+            $this->topics[] = [
                "name" => $node->text(),
                "slug" => str_replace("https://mihaaru.com/", "", $node->attr('href'))
            ];
@@ -57,7 +55,7 @@ class MihaaruScraper

        //Remove all the alphabets from string
        //preg_replace("/[a-zA-Z]/", "",$string);
-        $data = [
+       return [
            'source'    => 'Mihaaru',
            'title'      => $this->title,
            'og_title'   => $crawler->filter('meta[property*="og:title"]')->first()->attr('content'),
@@ -67,9 +65,7 @@ class MihaaruScraper
            'date'       => $date,
            'guid'       => $guid,
            'author'     => $this->author,
-            'topics'       => $this->tags,
+            'topics'       => $this->topics
        ];
-
-        return $data;
    }
 }
@@ -0,0 +1,78 @@
+<?php
+
+namespace App\Services\Scrapers;
+
+use Goutte\Client;
+
+class ThiladhunScraper
+{
+    protected $client;
+
+    protected $title;
+    protected $content = [];
+    protected $guid;
+    protected $image;
+    protected $author;
+
+    /**
+     * Create the scraper with a fresh Goutte HTTP client.
+     *
+     * @return void
+     */
+    public function __construct()
+    {
+        $this->client = new Client();
+    }
+
+    /**
+     * Fetch a Thiladhun article page and extract its fields.
+     *
+     * The returned guid is always derived from $url (the URL with the
+     * "https://thiladhun.com/" prefix removed); the $guid argument is accepted
+     * for signature compatibility with callers but is not used.
+     *
+     * @param mixed $url  Absolute URL of the article page
+     * @param mixed $date Publication date, passed through unchanged
+     * @param mixed $guid Unused, see above
+     *
+     * @return array Keys: service, source, title, og_title, image, content,
+     *               date, url, author, guid, topics
+     */
+    public function extract($url, $date = null, $guid = null)
+    {
+        // Reset per-article state so a reused instance never leaks values from
+        // a previous page, and so "content" is always an array.
+        $this->title = null;
+        $this->content = [];
+        $this->image = null;
+        $this->author = null;
+
+        $this->guid = str_replace('https://thiladhun.com/', '', $url);
+
+        $crawler = $this->client->request('GET', $url);
+
+        // When several nodes match, the last one wins for each scalar field.
+        $crawler->filter('h1')->each(function ($node) {
+            $this->title = $node->text();
+        });
+
+        $crawler->filter('div.single-body.entry-content.typography-copy p')->each(function ($node) {
+            // Strip ASCII letters from every paragraph before storing it.
+            $this->content[] = preg_replace("/[a-zA-Z]/", "", $node->text());
+        });
+
+        $crawler->filter('div[class*="entry-thumb single-entry-thumb"] img')->each(function ($node) {
+            $this->image = $node->attr('src');
+        });
+
+        $crawler->filter('a[class*="entry-author__name"]')->each(function ($node) {
+            $this->author = $node->text();
+        });
+
+        // The argument of text() is the fallback used when no <title> node
+        // exists; fall back to the scraped heading rather than a literal
+        // placeholder string.
+        $pageTitle = $crawler->filter('title')->first()->text($this->title ?? '');
+
+        return [
+            'service'    => 'Thiladhun News',
+            // Same value under "source", the key MihaaruScraper uses.
+            'source'     => 'Thiladhun News',
+            'title'      => $this->title,
+            'og_title'   => str_replace(" | Thiladhun", "", $pageTitle),
+            'image'      => $this->image,
+            'content'    => $this->content,
+            'date'       => $date,
+            'url'        => $url,
+            'author'     => $this->author,
+            'guid'       => $this->guid,
+            // No topics are scraped from the page; every article gets this one.
+            'topics'       =>  [
+                [
+                    "name" => "ވަކި މަޢުލޫއެއް ނޭންގެ",
+                    "slug" => "uncategorized"
+                ]
+            ]
+        ];
+    }
+}
@@ -0,0 +1,30 @@
+<?php
+
+namespace App\Services;
+
+use App\Services\Scrapers\ThiladhunScraper;
+
+class ThiladhunService extends Client
+{
+    /**
+     * Scrape every article listed in the Thiladhun RSS feed.
+     *
+     * Reads https://thiladhun.com/feed through the inherited get() method and
+     * runs each feed item through a new ThiladhunScraper instance.
+     *
+     * @return array List of article arrays as returned by ThiladhunScraper::extract()
+     */
+    public function scrape(): array
+    {
+        $feed = $this->get("https://thiladhun.com/feed");
+
+        // A failed or empty feed yields no items instead of an undefined-index error.
+        $items = $feed["channel"]["item"] ?? [];
+
+        // NOTE(review): assumes get() may collapse a feed holding a single
+        // <item> into one associative array, as XML-to-array conversions
+        // commonly do; wrap it so the loop always sees a list. Confirm
+        // against Client::get().
+        if (isset($items['link'])) {
+            $items = [$items];
+        }
+
+        $scraped = [];
+
+        // Each item is scraped with its own scraper instance.
+        foreach ($items as $item) {
+            $scraped[] = (new ThiladhunScraper)->extract(
+                $item['link'],
+                $item['pubDate'] ?? null,
+                $item['guid'] ?? null
+            );
+        }
+
+        return $scraped;
+    }
+}