Fix mihaaru scraper

Mohamed jinas
2024-01-13 04:53:31 +05:00
parent 252d750d7d
commit 736181d461
5 changed files with 96 additions and 24 deletions

View File

@@ -25,9 +25,9 @@ class Kernel extends ConsoleKernel
      */
     protected function schedule(Schedule $schedule)
     {
-        // $schedule->command('scrape:mihaaru')->everyFiveMinutes()
-        //     ->runInBackground()
-        //     ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru");
+        $schedule->command('scrape:mihaaru')->everyFiveMinutes()
+            ->runInBackground()
+            ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru");
         $schedule->command('scrape:sun')->everyFiveMinutes()
             ->runInBackground()
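
Note: pingOnSuccess() sends a GET request to config('app.url') . "/api/ping/mihaaru" after the command succeeds; that endpoint is not part of this diff. A minimal sketch of what such a route could look like (the route file location and the closure body are assumptions):

<?php

// routes/api.php (assumed) — heartbeat endpoint hit by the scheduler's
// pingOnSuccess() call. Storing a timestamp in cache is illustrative only.
use Illuminate\Support\Facades\Route;

Route::get('/ping/{source}', function (string $source) {
    cache()->put("heartbeat.{$source}", now()->toDateTimeString());

    return response()->json([
        'source'    => $source,
        'pinged_at' => now()->toDateTimeString(),
    ]);
});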

View File

@@ -20,7 +20,6 @@ class SourcesAPIController extends Controller
     {
         return Cache::remember('sources.index', 300, function () {
             return SourceResource::collection(Source::whereNotIn('slug',[
-                'mihaaru',
                 'hama',
                 'zaviyani',
                 'funadhoo-times',

View File

@@ -0,0 +1,68 @@
<?php

namespace App\Services\Feeds;

use Illuminate\Support\Facades\Http;
use Carbon\Carbon;

class MihaaruFeed implements Feed
{
    /**
     * Get all the latest news
     *
     * @return array
     */
    public function get() : array
    {
        $response = Http::withOptions([
            'proxy' => config('karudhaas.proxy.host')
        ])->withHeaders([
            'Referer' => 'https://mihaaru.com/?ref=mhr-lm',
        ])
        ->get('https://mihaaru.com/api/home/latest-popular-weekly?type=latest')
        ->json();

        $feeds = [];

        foreach ($response['data'] as $item) {
            // Approximate the date from the human-readable format
            $date = $this->approximateDateFromHumanTime($item['human_time']);

            $feeds[] = [
                "title" => $item['short_headline'],
                "link" => $item['link'],
                "date" => $date
            ];
        }

        return $feeds;
    }

    /**
     * Approximates the date from a human-readable time format.
     *
     * @param string $humanTime
     * @return string
     */
    protected function approximateDateFromHumanTime($humanTime)
    {
        $now = Carbon::now();

        // Example pattern: "11 hr", "1 day"
        if (preg_match('/(\d+)\s*(hr|hour|day|days)/', $humanTime, $matches)) {
            $number = $matches[1];
            $unit = $matches[2];

            switch ($unit) {
                case 'hr':
                case 'hour':
                    return $now->subHours($number)->toDateTimeString();
                case 'day':
                case 'days':
                    return $now->subDays($number)->toDateTimeString();
                default:
                    return $now->toDateTimeString();
            }
        }

        return $now->toDateTimeString();
    }
}
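
Note: the Feed contract that MihaaruFeed implements is not included in this diff. A minimal sketch of what App\Services\Feeds\Feed is assumed to look like, based only on how get() is used above:

<?php

namespace App\Services\Feeds;

// Assumed minimal contract (not part of this commit): implementations only
// need to return an array of ['title', 'link', 'date'] items.
interface Feed
{
    /**
     * @return array<int, array{title: string, link: string, date: string}>
     */
    public function get(): array;
}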

View File

@@ -2,6 +2,7 @@
 namespace App\Services;
 
+use App\Services\Feeds\MihaaruFeed;
 use App\Services\Scrapers\MihaaruScraper;
 use Illuminate\Support\Str;
@@ -15,17 +16,11 @@ class MihaaruService extends Client
     public function scrape(): array
     {
         //Return only the rss that contains "news" keyboard in its url
-        $articles = collect($this->get("https://mihaaru.com/rss")["channel"]["item"])
-            ->filter(function ($item, $key) {
-                return Str::of($item["link"])->contains(['news']);
-            });
+        $articles = (new MihaaruFeed)->get();
 
         $articlesitems = [];
         //Looping through the articles and scraping and while scraping it creates a new instance of the scraper.
         foreach ($articles as $article) {
-            $link = $article['link'];
-            $date = $article['pubDate'];
-            $articlesitems[] = (new MihaaruScraper)->extract($link, $date);
+            $articlesitems[] = (new MihaaruScraper)->extract($article['link'], $article['date']);
         }
 
         return $articlesitems;
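
For a quick manual check of the rewired pipeline (illustrative only, not part of the commit), the service can be exercised from php artisan tinker, assuming it resolves from the container without extra constructor arguments:

// php artisan tinker
$items = app(\App\Services\MihaaruService::class)->scrape();

// Each item comes from MihaaruScraper::extract(); dump the first one to
// confirm title, image, author, content and topics are being picked up.
dump(count($items), $items[0] ?? null);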

View File

@@ -3,6 +3,7 @@
 namespace App\Services\Scrapers;
 
 use Goutte\Client;
+use Symfony\Component\HttpClient\HttpClient;
 
 class MihaaruScraper
 {
@@ -16,7 +17,11 @@ class MihaaruScraper
     public function __construct()
     {
-        $this->client = new Client;
+        $this->client = new Client(
+            HttpClient::create([
+                "proxy" => config('karudhaas.proxy.host')
+            ])
+        );
     }
 
     public function extract($url, $date = null)
@@ -28,9 +33,7 @@ class MihaaruScraper
             $this->title = $node->text();
         });
 
-        $crawler->filter('.container img')->eq(3)->each(function ($node) {
-            $this->image = $node->attr('src');
-        });
+        $this->image = $crawler->filter('.w-full.flex.flex-col.items-end.max-w-3xl.mb-10.relative img')->attr('src');
 
         $crawler->filter('.by-line address')->each(function ($node) {
             $author = $node->text();
@@ -41,15 +44,22 @@ class MihaaruScraper
             $this->author = $cleaneddata;
         });
 
-        $crawler->filter('article p')->each(function ($node) {
-            $this->content[] = preg_replace("/[a-zA-Z]/","",$node->text());
+        $crawler->filter('.text-faseyha')->each(function ($node) {
+            $this->content[] = $node->text();
         });
 
-        $crawler->filter('.article-tags')->each(function ($node) {
+        $crawler->filter('.items-end a')->each(function ($node) {
+            try {
+                $topicName = $node->filter('span')->text();
+                $topicSlug = ltrim($node->attr('href'), '/');
+            } catch (\Throwable $th) {
+                return;
+            }
+
             $this->topics[] = [
-                "name" => $node->text(),
-                "slug" => str_replace("https://mihaaru.com/", "", $node->attr('href'))
+                "name" => $topicName,
+                "slug" => $topicSlug
             ];
         });
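
Note: both the new feed and the reworked scraper read config('karudhaas.proxy.host'), and that config file is not part of this diff. A minimal sketch of the assumed config/karudhaas.php entry (the env variable name is hypothetical):

<?php

// config/karudhaas.php (assumed shape, not included in this commit).
// Only karudhaas.proxy.host is read by MihaaruFeed and MihaaruScraper.
return [
    'proxy' => [
        'host' => env('KARUDHAAS_PROXY_HOST'),
    ],
];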