Add adhadhu support
This commit is contained in:
29
app/Services/AdhadhuService.php
Normal file
29
app/Services/AdhadhuService.php
Normal file
@@ -0,0 +1,29 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services;
|
||||
|
||||
use App\Services\Feeds\AdhadhuFeed;
|
||||
use App\Services\Scrapers\AdhadhuScraper;
|
||||
|
||||
class AdhadhuService extends Client
{
    /**
     * Scrape every article currently listed by the Adhadhu feed.
     *
     * Each feed entry's link and date are handed to a freshly constructed
     * AdhadhuScraper; entries for which the scraper yields null are left
     * out of the returned list.
     *
     * @return array
     */
    public function scrape(): array
    {
        $feedEntries = (new AdhadhuFeed)->get();

        // One scraper instance per entry, visited in feed order.
        $scraped = array_map(static function ($entry) {
            return (new AdhadhuScraper)->extract($entry["link"], $entry["date"]);
        }, $feedEntries);

        $kept = array_filter($scraped, static function ($item) {
            return $item !== null;
        });

        // array_filter keeps the original keys; reindex to a plain list.
        return array_values($kept);
    }
}
|
||||
82
app/Services/Feeds/AdhadhuFeed.php
Normal file
82
app/Services/Feeds/AdhadhuFeed.php
Normal file
@@ -0,0 +1,82 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Feeds;
|
||||
|
||||
use Goutte\Client;
|
||||
use Illuminate\Support\Carbon;
|
||||
|
||||
class AdhadhuFeed implements Feed
{
    /**
     * HTTP client used to fetch the category listing page.
     *
     * @var \Goutte\Client
     */
    protected $client;

    public function __construct()
    {
        $this->client = new Client();
    }

    /**
     * Return the latest articles from Adhadhu
     *
     * Each entry is an array with the keys "title", "link" (absolute URL)
     * and "date" (a 'Y-m-d H:i:s' string derived from the relative time
     * shown next to the article).
     *
     * @return array
     */
    public function get(): array
    {
        $crawler = $this->client->request('GET', "https://adhadhu.com/category/News");

        $feeds = [];

        // Parse the news articles
        $crawler->filter('div.category-news div.row div.list a.item, div.category-news div.row div.list a')->each(function ($node) use (&$feeds) {
            $heading = $node->filter('h4');
            $link = $node->attr('href');

            // The selector matches every anchor inside the list. Calling text()
            // on an empty match throws, so anchors that carry no headline or no
            // href are skipped instead of aborting the whole feed.
            if ($heading->count() === 0 || $link === null || $link === '') {
                return;
            }

            // The relative-time label is optional; an empty string makes
            // extractDate() fall back to the current time.
            $timeNode = $node->filter('p.font-11');
            $timeText = $timeNode->count() > 0 ? $timeNode->text() : '';

            $feeds[] = [
                "title" => trim($heading->text()),
                "link" => $this->absoluteUrl($link),
                "date" => $this->extractDate($timeText),
            ];
        });

        return $feeds;
    }

    /**
     * Turn an href taken from the listing page into an absolute URL.
     *
     * Hrefs that already carry an http(s) scheme are returned untouched;
     * anything else is resolved against the site root with exactly one
     * slash between host and path.
     *
     * @param string $href
     * @return string
     */
    protected function absoluteUrl($href)
    {
        if (preg_match('#^https?://#i', $href) === 1) {
            return $href;
        }

        return "https://adhadhu.com/" . ltrim($href, '/');
    }

    /**
     * Extract and format the date from the text
     *
     * Looks for "<number> <unit>" where unit is second, minute, hour, day,
     * week, month or year (optionally plural, any letter case) and subtracts
     * that amount from the current time. When nothing matches, the current
     * time is returned.
     *
     * @param string $timeText
     * @return string Timestamp formatted as 'Y-m-d H:i:s'
     */
    protected function extractDate($timeText)
    {
        $moment = Carbon::now();

        if (preg_match('/(\d+)\s*(second|minute|hour|day|week|month|year)s?/i', (string) $timeText, $matches) === 1) {
            // The capture is a digit string; Carbon expects an integer amount.
            $amount = (int) $matches[1];

            // Results are reassigned so this works whether the Carbon
            // instance is mutable or immutable.
            switch (strtolower($matches[2])) {
                case 'second':
                    $moment = $moment->subSeconds($amount);
                    break;
                case 'minute':
                    $moment = $moment->subMinutes($amount);
                    break;
                case 'hour':
                    $moment = $moment->subHours($amount);
                    break;
                case 'day':
                    $moment = $moment->subDays($amount);
                    break;
                case 'week':
                    $moment = $moment->subWeeks($amount);
                    break;
                case 'month':
                    $moment = $moment->subMonths($amount);
                    break;
                case 'year':
                    $moment = $moment->subYears($amount);
                    break;
            }
        }

        return $moment->format('Y-m-d H:i:s');
    }
}
|
||||
|
||||
65
app/Services/Scrapers/AdhadhuScraper.php
Normal file
65
app/Services/Scrapers/AdhadhuScraper.php
Normal file
@@ -0,0 +1,65 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Scrapers;
|
||||
|
||||
use Goutte\Client;
|
||||
use Illuminate\Support\Str;
|
||||
|
||||
class AdhadhuScraper
{
    /**
     * HTTP client used to fetch article pages.
     *
     * @var \Goutte\Client
     */
    protected $client;

    // State of the most recent extract() call; reset at the start of each call.
    protected $title;
    protected $content = [];
    protected $image;
    protected $topics = [];
    protected $author;

    public function __construct()
    {
        $this->client = new Client;
    }

    /**
     * Fetch an Adhadhu article page and pull out its structured fields.
     *
     * @param string      $url  Absolute URL of the article.
     * @param string|null $date Publication date, passed through unchanged.
     * @return array|null Array with the keys source, title, og_title, image,
     *                    content, url, date, guid, author and topics; null when
     *                    the page has no article headline. image, author and
     *                    og_title are null when the page lacks those elements.
     */
    public function extract($url, $date = null)
    {
        $crawler = $this->client->request('GET', $url);

        // Start from a clean slate so a reused instance never carries
        // paragraphs or tags over from a previously scraped article.
        $this->content = [];
        $this->topics = [];

        // Extract title; without a headline the page is not treated as an article.
        $this->title = $this->firstText($crawler->filter('h1.font-52'));
        if ($this->title === null) {
            return null;
        }

        // Extract image URL
        $this->image = $this->firstAttr($crawler->filter('img.img-fluid.hero-img'), 'src');

        // Extract author name (taken from the avatar image's alt text)
        $this->author = $this->firstAttr($crawler->filter('.MuiAvatar-circle img'), 'alt');

        // Extract content
        $crawler->filter('.body > p')->each(function ($node) {
            $this->content[] = $node->text();
        });

        // Extract topics (tags)
        $crawler->filter('a[href^="/tags/"]')->each(function ($node) {
            // Prefer the inner .tag label; fall back to the anchor's own text
            // when the label element is absent.
            $label = $this->firstText($node->filter('.tag'));
            if ($label === null) {
                $label = $node->text();
            }

            // basename() yields the last path segment of the tag URL.
            $this->topics[] = [
                "name" => trim($label),
                "slug" => Str::slug(basename($node->attr('href'))),
            ];
        });

        return [
            'source' => 'Adhadhu',
            'title' => $this->title,
            'og_title' => $this->firstAttr($crawler->filter('meta[property*="og:title"]'), 'content'),
            'image' => $this->image,
            'content' => $this->content,
            'url' => $url,
            'date' => $date,
            'guid' => str_replace("https://adhadhu.com/news/", "", $url),
            'author' => $this->author,
            'topics' => $this->topics,
        ];
    }

    /**
     * Text of the first matched node, or null when the selection is empty.
     *
     * Guards against the exception the crawler raises when text() is called
     * on an empty node list.
     *
     * @param \Symfony\Component\DomCrawler\Crawler $nodes
     * @return string|null
     */
    private function firstText($nodes)
    {
        return $nodes->count() > 0 ? $nodes->first()->text() : null;
    }

    /**
     * Attribute of the first matched node, or null when the selection is empty.
     *
     * @param \Symfony\Component\DomCrawler\Crawler $nodes
     * @param string                                $attribute
     * @return string|null
     */
    private function firstAttr($nodes, $attribute)
    {
        return $nodes->count() > 0 ? $nodes->first()->attr($attribute) : null;
    }
}
|
||||
Reference in New Issue
Block a user