Add adhadhu support

This commit is contained in:
Mohamed jinas
2024-01-07 16:58:18 +05:00
parent 1cc86fbbde
commit 46ead16ef8
11 changed files with 497 additions and 6 deletions

View File

@@ -0,0 +1,73 @@
<?php
namespace App\Console\Commands;
use Illuminate\Console\Command;
use App\Source;
use App\Services\AdhadhuService;
use App\Topic;
use Illuminate\Support\Carbon;
class ScrapeAdhadhuCommand extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'scrape:adhadhu';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Scrape Adhadhu';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * Looks up the source whose slug is "adhadhu", runs AdhadhuService::scrape()
     * and upserts every returned article (keyed by its guid) under that source.
     * Each topic of an article is created on first sight and attached to the
     * article without detaching topics that are already linked.
     *
     * @return int 0 on success, 1 when the "adhadhu" source record does not exist
     */
    public function handle()
    {
        $source = Source::where('slug', 'adhadhu')->first();

        // Without the source row there is nothing to attach articles to; fail
        // loudly instead of crashing on a null dereference further down.
        if ($source === null) {
            $this->error('Source with slug "adhadhu" was not found; nothing was scraped.');

            return 1;
        }

        $articles = (new AdhadhuService)->scrape();

        foreach ($articles as $article) {
            // The guid identifies the article, so re-running the command
            // updates existing rows rather than inserting duplicates.
            $articleModel = $source->articles()->updateOrCreate(
                ["guid" => $article["guid"]],
                [
                    "title" => $article["title"],
                    "url" => $article["url"],
                    "author" => $article["author"],
                    "featured_image" => $article["image"],
                    "body" => $article["content"],
                    "published_date" => $article["date"],
                    "meta" => [
                        "title" => $article["og_title"],
                    ],
                ]
            );

            foreach ($article["topics"] ?? [] as $topic) {
                $topicModel = Topic::firstOrCreate(
                    ["slug" => $topic["slug"]],
                    ["name" => $topic["name"]]
                );

                // Attach only; links created by earlier runs are kept.
                $topicModel->articles()->syncWithoutDetaching($articleModel);
            }
        }

        return 0;
    }
}

View File

@@ -101,8 +101,13 @@ class Kernel extends ConsoleKernel
$schedule->command('scrape:dhiyares')->everyFiveMinutes()
->runInBackground()
->pingOnSuccess(config('app.url') . "/api/ping/dhiyares");
$schedule->command('scrape:adhadhu')->everyFiveMinutes()
->runInBackground()
->pingOnSuccess(config('app.url') . "/api/ping/adhadhu");
}
/**
* Register the commands for the application.
*

View File

@@ -0,0 +1,29 @@
<?php
namespace App\Services;
use App\Services\Feeds\AdhadhuFeed;
use App\Services\Scrapers\AdhadhuScraper;
class AdhadhuService extends Client
{
    /**
     * Scrape the latest articles listed by the Adhadhu feed.
     *
     * Every entry returned by AdhadhuFeed::get() is fetched and parsed with a
     * fresh AdhadhuScraper. An article that cannot be fetched or parsed is
     * reported and skipped so that it does not discard the articles that were
     * scraped successfully in the same run.
     *
     * @return array list of the arrays returned by AdhadhuScraper::extract()
     */
    public function scrape(): array
    {
        $articleItems = [];

        foreach ((new AdhadhuFeed)->get() as $article) {
            try {
                $scrapedData = (new AdhadhuScraper)->extract($article["link"], $article["date"]);
            } catch (\Exception $e) {
                // Hand the failure to the application's exception handler
                // (Laravel's report() helper) and carry on with the next entry.
                report($e);

                continue;
            }

            if ($scrapedData !== null) {
                $articleItems[] = $scrapedData;
            }
        }

        return $articleItems;
    }
}

View File

@@ -0,0 +1,82 @@
<?php
namespace App\Services\Feeds;
use Goutte\Client;
use Illuminate\Support\Carbon;
class AdhadhuFeed implements Feed
{
    protected $client;

    public function __construct()
    {
        $this->client = new Client();
    }

    /**
     * Return the latest articles listed on the Adhadhu news category page.
     *
     * Each entry is an array with the keys "title", "link" (absolute URL) and
     * "date" ("Y-m-d H:i:s" string derived from the relative time shown on the
     * page). Anchors that carry no headline or no href are skipped.
     *
     * @return array
     */
    public function get(): array
    {
        $crawler = $this->client->request('GET', "https://adhadhu.com/category/News");
        $feeds = [];

        // Parse the news articles
        $crawler->filter('div.category-news div.row div.list a.item, div.category-news div.row div.list a')->each(function ($node) use (&$feeds) {
            $titleNode = $node->filter('h4');
            $link = $node->attr('href');

            // The selector also matches plain anchors inside the list; calling
            // text() on an empty node list throws, so skip anything that is
            // not shaped like an article card.
            if ($titleNode->count() === 0 || $link === null || $link === '') {
                return;
            }

            // A missing time label falls back to "now" inside extractDate().
            $timeNode = $node->filter('p.font-11');
            $timeText = $timeNode->count() > 0 ? $timeNode->text() : '';

            // Hrefs are expected to be site-relative; an already absolute URL
            // is kept untouched instead of being prefixed a second time.
            $isAbsolute = strpos($link, 'http://') === 0 || strpos($link, 'https://') === 0;

            $feeds[] = [
                "title" => trim($titleNode->text()),
                "link" => $isAbsolute ? $link : "https://adhadhu.com" . $link,
                "date" => $this->extractDate($timeText),
            ];
        });

        return $feeds;
    }

    /**
     * Convert a relative time label (e.g. "5 minutes ago") into a timestamp.
     *
     * Recognises seconds, minutes, hours and days (singular or plural, any
     * letter case). When the text contains no recognisable amount the current
     * time is returned.
     *
     * @param string $timeText
     * @return string date formatted as "Y-m-d H:i:s"
     */
    protected function extractDate($timeText)
    {
        $moment = Carbon::now();

        if (preg_match('/(\d+)\s*(minute|hour|day|second)s?/i', (string) $timeText, $matches)) {
            // Builds subMinutes / subHours / subDays / subSeconds from the unit.
            $method = 'sub' . ucfirst(strtolower($matches[2])) . 's';
            $moment = $moment->{$method}((int) $matches[1]);
        }

        return $moment->format('Y-m-d H:i:s');
    }
}

View File

@@ -0,0 +1,65 @@
<?php
namespace App\Services\Scrapers;
use Goutte\Client;
use Illuminate\Support\Str;
class AdhadhuScraper
{
    protected $client;
    protected $title;
    protected $content = [];
    protected $image;
    protected $topics = [];
    protected $author;

    public function __construct()
    {
        $this->client = new Client;
    }

    /**
     * Fetch an Adhadhu article page and extract its fields.
     *
     * @param string      $url  absolute URL of the article
     * @param string|null $date publication date to pass through unchanged
     * @return array|null array with the keys source, title, og_title, image,
     *                    content (list of paragraph texts), url, date, guid,
     *                    author and topics (list of name/slug pairs); null when
     *                    the page has no headline and so is not an article
     */
    public function extract($url, $date = null)
    {
        // Start from a clean slate so a reused instance never carries
        // paragraphs or topics over from a previously extracted article.
        $this->content = [];
        $this->topics = [];

        $crawler = $this->client->request('GET', $url);

        // Extract title; without one the page is treated as unparseable.
        $this->title = $this->firstText($crawler, 'h1.font-52');
        if ($this->title === null) {
            return null;
        }

        // Image and author are optional: they stay null when absent.
        $this->image = $this->firstAttr($crawler, 'img.img-fluid.hero-img', 'src');
        $this->author = $this->firstAttr($crawler, '.MuiAvatar-circle img', 'alt');

        // Extract content
        $crawler->filter('.body > p')->each(function ($node) {
            $this->content[] = $node->text();
        });

        // Extract topics (tags)
        $crawler->filter('a[href^="/tags/"]')->each(function ($node) {
            $label = $node->filter('.tag');

            // A tag link without a label has no usable name; skip it.
            if ($label->count() === 0) {
                return;
            }

            // The last segment of the href is used as the topic slug.
            $slug = basename($node->attr('href'));

            $this->topics[] = [
                "name" => trim($label->first()->text()),
                "slug" => Str::slug($slug),
            ];
        });

        // Fall back to the headline when the page has no og:title meta tag.
        $ogTitle = $this->firstAttr($crawler, 'meta[property*="og:title"]', 'content');

        return [
            'source' => 'Adhadhu',
            'title' => $this->title,
            'og_title' => $ogTitle ?? $this->title,
            'image' => $this->image,
            'content' => $this->content,
            'url' => $url,
            'date' => $date,
            'guid' => str_replace("https://adhadhu.com/news/", "", $url),
            'author' => $this->author,
            'topics' => $this->topics,
        ];
    }

    /**
     * Text of the first node matching the selector, or null when none matches.
     *
     * @param \Symfony\Component\DomCrawler\Crawler $crawler
     * @param string $selector
     * @return string|null
     */
    private function firstText($crawler, $selector)
    {
        $nodes = $crawler->filter($selector);

        return $nodes->count() > 0 ? $nodes->first()->text() : null;
    }

    /**
     * Attribute of the first node matching the selector, or null when none matches.
     *
     * @param \Symfony\Component\DomCrawler\Crawler $crawler
     * @param string $selector
     * @param string $attribute
     * @return string|null
     */
    private function firstAttr($crawler, $selector, $attribute)
    {
        $nodes = $crawler->filter($selector);

        return $nodes->count() > 0 ? $nodes->first()->attr($attribute) : null;
    }
}