Fix mihaaru scraper
app/Console/Kernel.php
@@ -25,9 +25,9 @@ class Kernel extends ConsoleKernel
      */
     protected function schedule(Schedule $schedule)
     {
-        // $schedule->command('scrape:mihaaru')->everyFiveMinutes()
-        //     ->runInBackground()
-        //     ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru");
+        $schedule->command('scrape:mihaaru')->everyFiveMinutes()
+            ->runInBackground()
+            ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru");

         $schedule->command('scrape:sun')->everyFiveMinutes()
             ->runInBackground()
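Laravel's pingOnSuccess() sends an HTTP GET to the given URL after the scheduled task finishes successfully, presumably so an uptime monitor can alert when the scraper goes stale. The ping endpoint itself is not part of this commit; a minimal sketch of what /api/ping/mihaaru might look like, with a hypothetical route and cache key:

// routes/api.php (hypothetical sketch; the real ping handler is not in this commit)
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Route;

Route::get('/ping/{source}', function (string $source) {
    // Record the time of the last successful run for this source,
    // so a monitor can alert on staleness.
    Cache::put("ping.{$source}", now()->toDateTimeString());

    return response()->noContent();
});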
@@ -20,7 +20,6 @@ class SourcesAPIController extends Controller
     {
         return Cache::remember('sources.index', 300, function () {
             return SourceResource::collection(Source::whereNotIn('slug',[
-                'mihaaru',
                 'hama',
                 'zaviyani',
                 'funadhoo-times',
app/Services/Feeds/MihaaruFeed.php (new file, 68 lines)
@@ -0,0 +1,68 @@
+<?php
+namespace App\Services\Feeds;
+
+use Illuminate\Support\Facades\Http;
+use Carbon\Carbon;
+
+class MihaaruFeed implements Feed
+{
+    /**
+     * Get all the latest news
+     *
+     * @return array
+     */
+    public function get(): array
+    {
+        $response = Http::withOptions([
+            'proxy' => config('karudhaas.proxy.host')
+        ])->withHeaders([
+            'Referer' => 'https://mihaaru.com/?ref=mhr-lm',
+        ])
+        ->get('https://mihaaru.com/api/home/latest-popular-weekly?type=latest')
+        ->json();
+
+        $feeds = [];
+        foreach ($response['data'] as $item) {
+            // Approximate the date from the human-readable format
+            $date = $this->approximateDateFromHumanTime($item['human_time']);
+
+            $feeds[] = [
+                "title" => $item['short_headline'],
+                "link" => $item['link'],
+                "date" => $date
+            ];
+        }
+
+        return $feeds;
+    }
+
+    /**
+     * Approximates the date from a human-readable time format.
+     *
+     * @param string $humanTime
+     * @return string
+     */
+    protected function approximateDateFromHumanTime($humanTime)
+    {
+        $now = Carbon::now();
+
+        // Example pattern: "11 hr", "1 day"
+        if (preg_match('/(\d+)\s*(hr|hour|day|days)/', $humanTime, $matches)) {
+            $number = $matches[1];
+            $unit = $matches[2];
+
+            switch ($unit) {
+                case 'hr':
+                case 'hour':
+                    return $now->subHours($number)->toDateTimeString();
+                case 'day':
+                case 'days':
+                    return $now->subDays($number)->toDateTimeString();
+                default:
+                    return $now->toDateTimeString();
+            }
+        }
+
+        return $now->toDateTimeString();
+    }
+}
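MihaaruFeed implements a Feed contract that is not included in this commit. Inferred from MihaaruFeed's signature and its usage in MihaaruService below, a minimal sketch of the interface presumably looks like this:

<?php

namespace App\Services\Feeds;

// Presumed shape of the Feed contract (not part of this diff):
// each feed returns an array of
// ['title' => string, 'link' => string, 'date' => string] rows.
interface Feed
{
    public function get(): array;
}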
app/Services/MihaaruService.php
@@ -2,6 +2,7 @@

 namespace App\Services;

+use App\Services\Feeds\MihaaruFeed;
 use App\Services\Scrapers\MihaaruScraper;
 use Illuminate\Support\Str;

@@ -15,17 +16,11 @@ class MihaaruService extends Client
     public function scrape(): array
     {
         // Return only the items that contain the "news" keyword in their url
-        $articles = collect($this->get("https://mihaaru.com/rss")["channel"]["item"])
-            ->filter(function ($item, $key) {
-                return Str::of($item["link"])->contains(['news']);
-            });
-
+        $articles = (new MihaaruFeed)->get();
         $articlesitems = [];
         // Loop through the articles, scraping each one with a fresh scraper instance.
         foreach ($articles as $article) {
-            $link = $article['link'];
-            $date = $article['pubDate'];
-            $articlesitems[] = (new MihaaruScraper)->extract($link, $date);
+            $articlesitems[] = (new MihaaruScraper)->extract($article['link'], $article['date']);
         }

         return $articlesitems;
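Taken together, the feed now drives the scraper: the JSON API replaces the RSS feed as the source of links and dates. A hypothetical invocation of the service (the scrape:mihaaru command body is not shown in this commit):

// Hypothetical usage sketch; the actual scrape:mihaaru command class is not in this diff.
$articles = (new \App\Services\MihaaruService)->scrape();

// Each element is the array built by MihaaruScraper::extract():
// 'content', 'url', 'date', 'guid', 'author', 'topics', and presumably
// 'title' and 'image' as well, though that part of the file is outside this diff.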
app/Services/Scrapers/MihaaruScraper.php
@@ -3,6 +3,7 @@
 namespace App\Services\Scrapers;

 use Goutte\Client;
+use Symfony\Component\HttpClient\HttpClient;

 class MihaaruScraper
 {
@@ -16,7 +17,11 @@ class MihaaruScraper

     public function __construct()
     {
-        $this->client = new Client;
+        $this->client = new Client(
+            HttpClient::create([
+                "proxy" => config('karudhaas.proxy.host')
+            ])
+        );
     }

     public function extract($url, $date = null)
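Both MihaaruFeed and the scraper's Goutte client now route their requests through a proxy read from config('karudhaas.proxy.host'). The config file itself is not part of this diff; presumably it looks something like this (the env key name is a guess for illustration):

<?php

// config/karudhaas.php (presumed shape, not included in this commit)
return [
    'proxy' => [
        // e.g. "http://user:pass@proxy.example.com:8080"
        'host' => env('KARUDHAAS_PROXY_HOST'),
    ],
];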
@@ -28,9 +33,7 @@ class MihaaruScraper
             $this->title = $node->text();
         });

-        $crawler->filter('.container img')->eq(3)->each(function ($node) {
-            $this->image = $node->attr('src');
-        });
+        $this->image = $crawler->filter('.w-full.flex.flex-col.items-end.max-w-3xl.mb-10.relative img')->attr('src');

         $crawler->filter('.by-line address')->each(function ($node) {
             $author = $node->text();
@@ -41,15 +44,22 @@ class MihaaruScraper
             $this->author = $cleaneddata;
         });

-        $crawler->filter('article p')->each(function ($node) {
-            $this->content[] = preg_replace("/[a-zA-Z]/","",$node->text());
+        $crawler->filter('.text-faseyha')->each(function ($node) {
+            $this->content[] = $node->text();
         });

-        $crawler->filter('.article-tags')->each(function ($node) {
+        $crawler->filter('.items-end a')->each(function ($node) {

+            try {
+                $topicName = $node->filter('span')->text();
+                $topicSlug = ltrim($node->attr('href'), '/');
+            } catch (\Throwable $th) {
+                return;
+            }
+
             $this->topics[] = [
-                "name" => $node->text(),
-                "slug" => str_replace("https://mihaaru.com/", "", $node->attr('href'))
+                "name" => $topicName,
+                "slug" => $topicSlug
             ];
         });

@@ -63,7 +73,7 @@ class MihaaruScraper
             'content' => $this->content,
             'url' => $url,
             'date' => $date,
-            'guid' => str_replace("https://mihaaru.com/news/","",$url),
+            'guid' => str_replace("https://mihaaru.com/news/", "", $url),
             'author' => $this->author,
             'topics' => $this->topics
         ];