Add adhadhu support

2024-01-07 16:58:18 +05:00
parent 1cc86fbbde
commit 46ead16ef8
11 changed files with 497 additions and 6 deletions
--- a/app/Services/AdhadhuService.php
+++ b/app/Services/AdhadhuService.php
@@ -0,0 +1,29 @@
+<?php
+
+namespace App\Services;
+
+use App\Services\Feeds\AdhadhuFeed;
+use App\Services\Scrapers\AdhadhuScraper;
+
+class AdhadhuService extends Client
+{
+    /**
+     * Fetch the current Adhadhu article listing and scrape every entry.
+     *
+     * @return array Scraped articles in listing order; null scrape results are dropped
+     */
+    public function scrape(): array
+    {
+        // One fresh scraper instance per listed article.
+        $scraped = array_map(static function ($entry) {
+            return (new AdhadhuScraper)->extract($entry["link"], $entry["date"]);
+        }, (new AdhadhuFeed)->get());
+
+        // Keep only the entries the scraper actually produced.
+        $kept = array_filter($scraped, static function ($item) {
+            return $item !== null;
+        });
+
+        return array_values($kept);
+    }
+}
--- a/app/Services/Feeds/AdhadhuFeed.php
+++ b/app/Services/Feeds/AdhadhuFeed.php
@@ -0,0 +1,82 @@
+<?php
+
+namespace App\Services\Feeds;
+
+use Goutte\Client;
+use Illuminate\Support\Carbon;
+
+class AdhadhuFeed implements Feed
+{
+    protected $client;
+
+    public function __construct()
+    {
+        $this->client = new Client();
+    }
+
+    /**
+     * Return the latest articles listed on the Adhadhu news category page,
+     * skipping anchors that lack a headline or an href.
+     *
+     * @return array List of ["title" => string, "link" => string, "date" => string]
+     */
+    public function get(): array
+    {
+        $crawler = $this->client->request('GET', "https://adhadhu.com/category/News");
+        $feeds = [];
+
+        $crawler->filter('div.category-news div.row div.list a.item, div.category-news div.row div.list a')->each(function ($node) use (&$feeds) {
+            $heading = $node->filter('h4');
+            $link = $node->attr('href');
+            // The selector matches every anchor in the list, so anchors without
+            // a headline or href are ignored rather than aborting the whole feed.
+            if ($heading->count() === 0 || $link === null || $link === '') {
+                return;
+            }
+
+            // A missing time label is treated like an unparseable one (current time).
+            $time = $node->filter('p.font-11');
+            $timeText = $time->count() > 0 ? $time->text() : '';
+
+            $feeds[] = [
+                "title" => trim($heading->text()),
+                // Only site-relative hrefs get the host prepended.
+                "link" => preg_match('#^https?://#i', $link) ? $link : "https://adhadhu.com" . $link,
+                "date" => $this->extractDate($timeText)
+            ];
+        });
+
+        return $feeds;
+    }
+
+    /**
+     * Convert a relative time label (e.g. "5 minutes ago", "an hour ago")
+     * into an absolute timestamp.
+     *
+     * @param string $timeText
+     * @return string "Y-m-d H:i:s"; the current time when nothing can be parsed
+     */
+    protected function extractDate($timeText)
+    {
+        // Unit keyword => Carbon method that subtracts that many units.
+        $subtractors = [
+            'second' => 'subSeconds',
+            'minute' => 'subMinutes',
+            'hour' => 'subHours',
+            'day' => 'subDays',
+            'week' => 'subWeeks',
+            'month' => 'subMonths',
+            'year' => 'subYears',
+        ];
+        $now = Carbon::now();
+        $pattern = '/(\d+|\ban?)\s*(' . implode('|', array_keys($subtractors)) . ')s?/i';
+        if (preg_match($pattern, (string) $timeText, $matches)) {
+            // "a"/"an" stand for exactly one unit.
+            $amount = is_numeric($matches[1]) ? (int) $matches[1] : 1;
+            $now = $now->{$subtractors[strtolower($matches[2])]}($amount);
+        }
+
+        return $now->format('Y-m-d H:i:s');
+    }
+}
+
--- a/app/Services/Scrapers/AdhadhuScraper.php
+++ b/app/Services/Scrapers/AdhadhuScraper.php
@@ -0,0 +1,65 @@
+<?php
+
+namespace App\Services\Scrapers;
+
+use Goutte\Client;
+use Illuminate\Support\Str;
+
+class AdhadhuScraper
+{
+    protected $client;
+
+    protected $title;
+    protected $content = [];
+    protected $image;
+    protected $topics = [];
+    protected $author;
+
+    public function __construct()
+    {
+        $this->client = new Client;
+    }
+
+    /**
+     * Scrape one Adhadhu article page; $date is passed through untouched.
+     *
+     * @return array|null Article payload, or null when an expected element is missing
+     */
+    public function extract($url, $date = null)
+    {
+        $crawler = $this->client->request('GET', $url);
+        $this->content = $this->topics = []; // reset so a reused instance starts clean
+
+        try {
+            $this->title = $crawler->filter('h1.font-52')->first()->text();
+            $this->image = $crawler->filter('img.img-fluid.hero-img')->first()->attr('src');
+            $this->author = $crawler->filter('.MuiAvatar-circle img')->first()->attr('alt');
+            $ogTitle = $crawler->filter('meta[property*="og:title"]')->first()->attr('content');
+
+            $crawler->filter('.body > p')->each(function ($node) {
+                $this->content[] = $node->text();
+            });
+            $crawler->filter('a[href^="/tags/"]')->each(function ($node) {
+                $this->topics[] = [
+                    "name" => trim($node->filter('.tag')->first()->text()),
+                    "slug" => Str::slug(basename($node->attr('href'))) // last URL segment
+                ];
+            });
+        } catch (\InvalidArgumentException $e) {
+            return null; // DomCrawler throws this when a selector matched nothing
+        }
+
+        return [
+            'source'    => 'Adhadhu',
+            'title'     => $this->title,
+            'og_title'   => $ogTitle,
+            'image'     => $this->image,
+            'content'   => $this->content,
+            'url'       => $url,
+            'date'      => $date,
+            'guid'      => str_replace("https://adhadhu.com/news/","",$url),
+            'author'    => $this->author,
+            'topics'    => $this->topics
+        ];
+    }
+}