diff --git a/app/Console/Kernel.php b/app/Console/Kernel.php index 96eba69..21f1e19 100644 --- a/app/Console/Kernel.php +++ b/app/Console/Kernel.php @@ -25,9 +25,9 @@ class Kernel extends ConsoleKernel */ protected function schedule(Schedule $schedule) { - // $schedule->command('scrape:mihaaru')->everyFiveMinutes() - // ->runInBackground() - // ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru"); + $schedule->command('scrape:mihaaru')->everyFiveMinutes() + ->runInBackground() + ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru"); $schedule->command('scrape:sun')->everyFiveMinutes() ->runInBackground() diff --git a/app/Http/Controllers/API/SourcesAPIController.php b/app/Http/Controllers/API/SourcesAPIController.php index 39c75b8..f2f6368 100644 --- a/app/Http/Controllers/API/SourcesAPIController.php +++ b/app/Http/Controllers/API/SourcesAPIController.php @@ -20,7 +20,6 @@ class SourcesAPIController extends Controller { return Cache::remember('sources.index', 300, function () { return SourceResource::collection(Source::whereNotIn('slug',[ - 'mihaaru', 'hama', 'zaviyani', 'funadhoo-times', diff --git a/app/Services/Feeds/MihaaruFeed.php b/app/Services/Feeds/MihaaruFeed.php new file mode 100644 index 0000000..9e4fb70 --- /dev/null +++ b/app/Services/Feeds/MihaaruFeed.php @@ -0,0 +1,68 @@ + config('karudhaas.proxy.host') + ])->withHeaders([ + 'Referer' => 'https://mihaaru.com/?ref=mhr-lm', + ]) + ->get('https://mihaaru.com/api/home/latest-popular-weekly?type=latest') + ->json(); + + $feeds = []; + foreach ($response['data'] as $item) { + // Approximate the date from the human-readable format + $date = $this->approximateDateFromHumanTime($item['human_time']); + + $feeds[] = [ + "title" => $item['short_headline'], + "link" => $item['link'], + "date" => $date + ]; + } + + return $feeds; + } + + /** + * Approximates the date from a human-readable time format. + * + * @param string $humanTime + * @return string + */ + protected function approximateDateFromHumanTime($humanTime) + { + $now = Carbon::now(); + + // Example pattern: "11 hr", "1 day" + if(preg_match('/(\d+)\s*(hr|hour|day|days)/', $humanTime, $matches)) { + $number = $matches[1]; + $unit = $matches[2]; + + switch ($unit) { + case 'hr': + case 'hour': + return $now->subHours($number)->toDateTimeString(); + case 'day': + case 'days': + return $now->subDays($number)->toDateTimeString(); + default: + return $now->toDateTimeString(); + } + } + + return $now->toDateTimeString(); + } +} diff --git a/app/Services/MihaaruService.php b/app/Services/MihaaruService.php index f170e2c..9b28610 100644 --- a/app/Services/MihaaruService.php +++ b/app/Services/MihaaruService.php @@ -2,6 +2,7 @@ namespace App\Services; +use App\Services\Feeds\MihaaruFeed; use App\Services\Scrapers\MihaaruScraper; use Illuminate\Support\Str; @@ -15,17 +16,11 @@ class MihaaruService extends Client public function scrape(): array { //Return only the rss that contains "news" keyboard in its url - $articles = collect($this->get("https://mihaaru.com/rss")["channel"]["item"]) - ->filter(function ($item, $key) { - return Str::of($item["link"])->contains(['news']); - }); - + $articles = (new MihaaruFeed)->get(); $articlesitems = []; //Looping through the articles and scraping and while scraping it creates a new instance of the scraper. foreach ($articles as $article) { - $link = $article['link']; - $date = $article['pubDate']; - $articlesitems[] = (new MihaaruScraper)->extract($link, $date); + $articlesitems[] = (new MihaaruScraper)->extract($article['link'], $article['date']); } return $articlesitems; diff --git a/app/Services/Scrapers/MihaaruScraper.php b/app/Services/Scrapers/MihaaruScraper.php index 59542e3..ed2e2cd 100644 --- a/app/Services/Scrapers/MihaaruScraper.php +++ b/app/Services/Scrapers/MihaaruScraper.php @@ -3,6 +3,7 @@ namespace App\Services\Scrapers; use Goutte\Client; +use Symfony\Component\HttpClient\HttpClient; class MihaaruScraper { @@ -16,7 +17,11 @@ class MihaaruScraper public function __construct() { - $this->client = new Client; + $this->client = new Client( + HttpClient::create([ + "proxy" => config('karudhaas.proxy.host') + ]) + ); } public function extract($url, $date = null) @@ -28,9 +33,7 @@ class MihaaruScraper $this->title = $node->text(); }); - $crawler->filter('.container img')->eq(3)->each(function ($node) { - $this->image = $node->attr('src'); - }); + $this->image = $crawler->filter('.w-full.flex.flex-col.items-end.max-w-3xl.mb-10.relative img')->attr('src'); $crawler->filter('.by-line address')->each(function ($node) { $author = $node->text(); @@ -41,21 +44,28 @@ class MihaaruScraper $this->author = $cleaneddata; }); - $crawler->filter('article p')->each(function ($node) { - $this->content[] = preg_replace("/[a-zA-Z]/","",$node->text()); + $crawler->filter('.text-faseyha')->each(function ($node) { + $this->content[] = $node->text(); }); - $crawler->filter('.article-tags')->each(function ($node) { - + $crawler->filter('.items-end a')->each(function ($node) { + + try { + $topicName = $node->filter('span')->text(); + $topicSlug = ltrim($node->attr('href'), '/'); + } catch (\Throwable $th) { + return; + } + $this->topics[] = [ - "name" => $node->text(), - "slug" => str_replace("https://mihaaru.com/", "", $node->attr('href')) + "name" => $topicName, + "slug" => $topicSlug ]; }); //Remove all the alphabets from string //preg_replace("/[a-zA-Z]/", "",$string); - return [ + return [ 'source' => 'Mihaaru', 'title' => $this->title, 'og_title' => $crawler->filter('meta[property*="og:title"]')->first()->attr('content'), @@ -63,7 +73,7 @@ class MihaaruScraper 'content' => $this->content, 'url' => $url, 'date' => $date, - 'guid' => str_replace("https://mihaaru.com/news/","",$url), + 'guid' => str_replace("https://mihaaru.com/news/", "", $url), 'author' => $this->author, 'topics' => $this->topics ];