Add adhadhu support
app/Console/Commands/ScrapeAdhadhuCommand.php (new file, 73 lines)
@@ -0,0 +1,73 @@
<?php

namespace App\Console\Commands;

use Illuminate\Console\Command;
use App\Source;
use App\Services\AdhadhuService;
use App\Topic;
use Illuminate\Support\Carbon;

class ScrapeAdhadhuCommand extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'scrape:adhadhu';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Scrape Adhadhu';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * @return int
     */
    public function handle()
    {
        $source = Source::where('slug', 'adhadhu')->first();

        $articles = (new AdhadhuService)->scrape();

        foreach ($articles as $article) {
            $articleModel = $source->articles()->updateOrCreate(
                ["guid" => $article["guid"]],
                [
                    "title" => $article["title"],
                    "url" => $article["url"],
                    "author" => $article["author"],
                    "featured_image" => $article["image"],
                    "body" => $article["content"],
                    "published_date" => $article["date"],
                    "meta" => [
                        "title" => $article["og_title"]
                    ]
                ]
            );

            collect($article["topics"])->each(function ($topic) use ($articleModel) {
                $topicModel = Topic::firstOrCreate(["slug" => $topic["slug"]], ["name" => $topic["name"]]);

                $topicModel->articles()->syncWithoutDetaching($articleModel);
            });
        }

        return 0;
    }
}
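
Note: handle() assumes a sources row with slug "adhadhu" already exists; otherwise $source is null and the articles() call fails. A minimal seeder sketch under that assumption — the class name and the "name" column are hypothetical, inferred only from how Source is queried above:

<?php

// Hypothetical seeder, not part of this commit: ensures the "adhadhu" Source
// row exists before scrape:adhadhu runs. Column names are assumptions; the
// file location depends on the project's Laravel version.
use App\Source;
use Illuminate\Database\Seeder;

class AdhadhuSourceSeeder extends Seeder
{
    public function run()
    {
        Source::firstOrCreate(
            ['slug' => 'adhadhu'],
            ['name' => 'Adhadhu'] // assumes "name" is fillable on Source
        );
    }
}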
app/Console/Kernel.php
@@ -101,8 +101,13 @@ class Kernel extends ConsoleKernel
        $schedule->command('scrape:dhiyares')->everyFiveMinutes()
            ->runInBackground()
            ->pingOnSuccess(config('app.url') . "/api/ping/dhiyares");

        $schedule->command('scrape:adhadhu')->everyFiveMinutes()
            ->runInBackground()
            ->pingOnSuccess(config('app.url') . "/api/ping/adhadhu");
    }

    /**
     * Register the commands for the application.
     *
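
The schedule pings config('app.url') . "/api/ping/adhadhu" on success, which implies an /api/ping/{slug} endpoint already exists elsewhere in the app (the dhiyares command pings the same pattern). Purely as an illustration of what such a route could look like — every name below is an assumption, not this project's actual implementation:

<?php

// Hypothetical route sketch only (e.g. in routes/api.php); the real endpoint
// is defined elsewhere in this project and may differ. This version just
// records the last successful scrape time per source in the cache.
use Illuminate\Support\Facades\Route;

Route::get('/ping/{slug}', function (string $slug) {
    cache()->put("scrape:last_ping:{$slug}", now()->toDateTimeString());

    return response()->noContent();
});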
app/Services/AdhadhuService.php (new file, 29 lines)
@@ -0,0 +1,29 @@
<?php

namespace App\Services;

use App\Services\Feeds\AdhadhuFeed;
use App\Services\Scrapers\AdhadhuScraper;

class AdhadhuService extends Client
{
    /**
     * Scrape the latest articles from Adhadhu
     *
     * @return array
     */
    public function scrape(): array
    {
        $articles = (new AdhadhuFeed)->get();

        $articleItems = [];
        foreach ($articles as $article) {
            $scrapedData = (new AdhadhuScraper)->extract($article["link"], $article["date"]);
            if ($scrapedData !== null) {
                $articleItems[] = $scrapedData;
            }
        }

        return $articleItems;
    }
}
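
For reference, a quick usage sketch of the service (e.g. from php artisan tinker); the key names mirror what AdhadhuScraper::extract() returns further down:

// Illustrative usage only.
$articles = (new \App\Services\AdhadhuService)->scrape();

foreach ($articles as $article) {
    // Each item carries: guid, title, og_title, url, image, content
    // (an array of paragraphs), date, author, source and topics (name/slug pairs).
    logger()->info('Scraped Adhadhu article', ['guid' => $article['guid']]);
}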
app/Services/Feeds/AdhadhuFeed.php (new file, 82 lines)
@@ -0,0 +1,82 @@
<?php

namespace App\Services\Feeds;

use Goutte\Client;
use Illuminate\Support\Carbon;

class AdhadhuFeed implements Feed
{
    protected $client;

    public function __construct()
    {
        $this->client = new Client();
    }

    /**
     * Return the latest articles from Adhadhu
     *
     * @return array
     */
    public function get(): array
    {
        $crawler = $this->client->request('GET', "https://adhadhu.com/category/News");

        $feeds = [];

        // Parse the news articles
        $crawler->filter('div.category-news div.row div.list a.item, div.category-news div.row div.list a')->each(function ($node) use (&$feeds) {
            // Extract the details of each article
            $title = $node->filter('h4')->text();
            $link = $node->attr('href');
            $timeText = $node->filter('p.font-11')->text();
            // Convert the relative time text into a formatted timestamp
            $date = $this->extractDate($timeText);

            $feeds[] = [
                "title" => trim($title),
                "link" => "https://adhadhu.com" . $link,
                "date" => $date
            ];
        });

        return $feeds;
    }

    /**
     * Extract and format the date from the text
     *
     * @param string $timeText
     * @return string
     */
    protected function extractDate($timeText)
    {
        // A simple regex to extract numbers and time units (e.g., "minutes", "hours")
        if (preg_match('/(\d+)\s*(minute|hour|day|second)s?/', $timeText, $matches)) {
            $number = $matches[1];
            $unit = $matches[2];

            // Use Carbon's sub methods to subtract the elapsed time from now
            switch ($unit) {
                case 'minute':
                    return Carbon::now()->subMinutes($number)->format('Y-m-d H:i:s');
                case 'hour':
                    return Carbon::now()->subHours($number)->format('Y-m-d H:i:s');
                case 'day':
                    return Carbon::now()->subDays($number)->format('Y-m-d H:i:s');
                case 'second':
                    return Carbon::now()->subSeconds($number)->format('Y-m-d H:i:s');
                default:
                    // Handle unexpected time units
                    return Carbon::now()->format('Y-m-d H:i:s');
            }
        } else {
            // Default to current time if parsing fails
            return Carbon::now()->format('Y-m-d H:i:s');
        }
    }
}
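
Since extractDate() is relative to the current time, it is easiest to pin the clock when exercising it. A minimal PHPUnit sketch, assuming Carbon's setTestNow helper; the test class and the anonymous subclass used to reach the protected method are illustrative only:

<?php

// Hypothetical test, not part of this commit: pins "now" and checks that
// "5 minutes" is translated into a timestamp five minutes in the past.
use App\Services\Feeds\AdhadhuFeed;
use Illuminate\Support\Carbon;
use PHPUnit\Framework\TestCase;

class AdhadhuFeedDateTest extends TestCase
{
    public function test_relative_minutes_become_a_past_timestamp()
    {
        Carbon::setTestNow('2024-01-01 12:00:00');

        // Anonymous subclass just to expose the protected extractDate() method.
        $feed = new class extends AdhadhuFeed {
            public function parse($timeText)
            {
                return $this->extractDate($timeText);
            }
        };

        $this->assertSame('2024-01-01 11:55:00', $feed->parse('5 minutes ago'));

        Carbon::setTestNow();
    }
}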
app/Services/Scrapers/AdhadhuScraper.php (new file, 65 lines)
@@ -0,0 +1,65 @@
<?php

namespace App\Services\Scrapers;

use Goutte\Client;
use Illuminate\Support\Str;

class AdhadhuScraper
{
    protected $client;

    protected $title;
    protected $content;
    protected $image;
    protected $topics = [];
    protected $author;

    public function __construct()
    {
        $this->client = new Client;
    }

    public function extract($url, $date = null)
    {
        $crawler = $this->client->request('GET', $url);

        // Extract title
        $this->title = $crawler->filter('h1.font-52')->first()->text();

        // Extract image URL
        $this->image = $crawler->filter('img.img-fluid.hero-img')->first()->attr('src');

        // Extract author name
        $this->author = $crawler->filter('.MuiAvatar-circle img')->first()->attr('alt');

        // Extract content
        $crawler->filter('.body > p')->each(function ($node) {
            $this->content[] = $node->text();
        });

        // Extract topics (tags)
        $crawler->filter('a[href^="/tags/"]')->each(function ($node) {
            $href = $node->attr('href');
            $slug = basename($href); // Extracts the last segment of the URL

            $this->topics[] = [
                "name" => trim($node->filter('.tag')->first()->text()),
                "slug" => Str::slug($slug)
            ];
        });

        return [
            'source' => 'Adhadhu',
            'title' => $this->title,
            'og_title' => $crawler->filter('meta[property*="og:title"]')->first()->attr('content'),
            'image' => $this->image,
            'content' => $this->content,
            'url' => $url,
            'date' => $date,
            'guid' => str_replace("https://adhadhu.com/news/", "", $url),
            'author' => $this->author,
            'topics' => $this->topics
        ];
    }
}
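
One caveat worth noting: AdhadhuService::scrape() skips null results, but extract() never returns null. Symfony's DomCrawler throws an \InvalidArgumentException when text() or attr() is called on an empty node list, so a page whose markup does not match these selectors aborts the whole run. A hedged sketch of a guard that would make the existing null check meaningful; the helper name is hypothetical:

<?php

// Hypothetical helper, not part of this commit: turns a selector miss into a
// null result (which AdhadhuService::scrape() already tolerates) instead of
// letting the exception bubble up and abort the whole scrape.
use App\Services\Scrapers\AdhadhuScraper;

function safeExtract(AdhadhuScraper $scraper, $url, $date = null)
{
    try {
        return $scraper->extract($url, $date);
    } catch (\InvalidArgumentException $e) {
        // A selector matched nothing on this page (e.g. the markup changed).
        return null;
    }
}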