Fix mihaaru scraper
This commit is contained in:
		| @@ -25,9 +25,9 @@ class Kernel extends ConsoleKernel | ||||
|      */ | ||||
|     protected function schedule(Schedule $schedule) | ||||
|     { | ||||
|         // $schedule->command('scrape:mihaaru')->everyFiveMinutes() | ||||
|         // ->runInBackground() | ||||
|         //     ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru"); | ||||
|         $schedule->command('scrape:mihaaru')->everyFiveMinutes() | ||||
|         ->runInBackground() | ||||
|             ->pingOnSuccess(config('app.url') . "/api/ping/mihaaru"); | ||||
|  | ||||
|         $schedule->command('scrape:sun')->everyFiveMinutes() | ||||
|             ->runInBackground() | ||||
|   | ||||
| @@ -20,7 +20,6 @@ class SourcesAPIController extends Controller | ||||
|     { | ||||
|         return Cache::remember('sources.index', 300, function () { | ||||
|             return SourceResource::collection(Source::whereNotIn('slug',[ | ||||
|                 'mihaaru', | ||||
|                 'hama', | ||||
|                 'zaviyani', | ||||
|                 'funadhoo-times', | ||||
|   | ||||
							
								
								
									
										68
									
								
								app/Services/Feeds/MihaaruFeed.php
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										68
									
								
								app/Services/Feeds/MihaaruFeed.php
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,68 @@ | ||||
| <?php | ||||
| namespace App\Services\Feeds; | ||||
|  | ||||
| use Illuminate\Support\Facades\Http; | ||||
| use Carbon\Carbon; | ||||
|  | ||||
class MihaaruFeed implements Feed
{
    /**
     * Get all the latest news.
     *
     * Requests the "latest" listing from the Mihaaru home API through the
     * configured proxy and maps every entry to a title/link/date triple.
     * The listing only carries a relative "human_time" value, so the date
     * of each entry is approximated from the current time.
     *
     * @return array List of ['title' => ..., 'link' => ..., 'date' => ...] entries;
     *               empty when the response carries no usable "data" list.
     */
    public function get() : array
    {
        $response = Http::withOptions([
            'proxy' => config('karudhaas.proxy.host'),
        ])->withHeaders([
            'Referer' => 'https://mihaaru.com/?ref=mhr-lm',
        ])
        ->get('https://mihaaru.com/api/home/latest-popular-weekly?type=latest')
        ->json();

        // The decoded body may be null (or lack "data") when the request is
        // blocked or the upstream answers with something other than the
        // expected JSON; treat that as "no articles" instead of failing.
        $items = [];
        if (is_array($response) && is_array($response['data'] ?? null)) {
            $items = $response['data'];
        }

        $feeds = [];
        foreach ($items as $item) {
            // An entry without a link cannot be scraped by the caller, skip it.
            if (!is_array($item) || empty($item['link'])) {
                continue;
            }

            // Approximate the date from the human-readable format
            $date = $this->approximateDateFromHumanTime($item['human_time'] ?? '');

            $feeds[] = [
                "title" => $item['short_headline'] ?? null,
                "link" => $item['link'],
                "date" => $date
            ];
        }

        return $feeds;
    }

    /**
     * Approximates the date from a human-readable time format.
     *
     * Recognises "<number> <unit>" where the unit starts with "min", "hr",
     * "hour" or "day" (case-insensitive), e.g. "5 min", "11 hr", "1 day",
     * "3 days". Anything else falls back to the current time.
     *
     * @param string $humanTime
     * @return string Date-time string as produced by Carbon's toDateTimeString().
     */
    protected function approximateDateFromHumanTime($humanTime)
    {
        $now = Carbon::now();

        // "day" also covers "days" and "hour" covers "hours" because the
        // pattern is not anchored at the end of the unit.
        if (preg_match('/(\d+)\s*(min|hr|hour|day)/i', (string) $humanTime, $matches)) {
            // The capture is a numeric string; hand Carbon a real integer.
            $amount = (int) $matches[1];

            switch (strtolower($matches[2])) {
                case 'min':
                    return $now->subMinutes($amount)->toDateTimeString();
                case 'hr':
                case 'hour':
                    return $now->subHours($amount)->toDateTimeString();
                case 'day':
                    return $now->subDays($amount)->toDateTimeString();
            }
        }

        return $now->toDateTimeString();
    }
}
| @@ -2,6 +2,7 @@ | ||||
|  | ||||
| namespace App\Services; | ||||
|  | ||||
| use App\Services\Feeds\MihaaruFeed; | ||||
| use App\Services\Scrapers\MihaaruScraper; | ||||
| use Illuminate\Support\Str; | ||||
|  | ||||
| @@ -15,17 +16,11 @@ class MihaaruService extends Client | ||||
|     public function scrape(): array | ||||
|     { | ||||
|         //Return only the RSS items that contain the "news" keyword in their URL | ||||
|         $articles = collect($this->get("https://mihaaru.com/rss")["channel"]["item"]) | ||||
|             ->filter(function ($item, $key) { | ||||
|                 return Str::of($item["link"])->contains(['news']); | ||||
|             }); | ||||
|  | ||||
|         $articles = (new MihaaruFeed)->get(); | ||||
|         $articlesitems = []; | ||||
|         //Looping through the articles and scraping and while scraping it creates a new instance of the scraper. | ||||
|         foreach ($articles as $article) { | ||||
|             $link = $article['link']; | ||||
|             $date = $article['pubDate']; | ||||
|             $articlesitems[] = (new MihaaruScraper)->extract($link, $date); | ||||
|             $articlesitems[] = (new MihaaruScraper)->extract($article['link'], $article['date']); | ||||
|         } | ||||
|  | ||||
|         return $articlesitems; | ||||
|   | ||||
| @@ -3,6 +3,7 @@ | ||||
| namespace App\Services\Scrapers; | ||||
|  | ||||
| use Goutte\Client; | ||||
| use Symfony\Component\HttpClient\HttpClient; | ||||
|  | ||||
| class MihaaruScraper | ||||
| { | ||||
| @@ -16,7 +17,11 @@ class MihaaruScraper | ||||
|  | ||||
|     public function __construct() | ||||
|     { | ||||
|         $this->client = new Client; | ||||
|         $this->client = new Client( | ||||
|             HttpClient::create([ | ||||
|                 "proxy" => config('karudhaas.proxy.host') | ||||
|             ]) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     public function extract($url, $date = null) | ||||
| @@ -28,9 +33,7 @@ class MihaaruScraper | ||||
|             $this->title = $node->text(); | ||||
|         }); | ||||
|  | ||||
|         $crawler->filter('.container  img')->eq(3)->each(function ($node) { | ||||
|             $this->image = $node->attr('src'); | ||||
|         }); | ||||
|         $this->image = $crawler->filter('.w-full.flex.flex-col.items-end.max-w-3xl.mb-10.relative img')->attr('src'); | ||||
|  | ||||
|         $crawler->filter('.by-line address')->each(function ($node) { | ||||
|             $author = $node->text(); | ||||
| @@ -41,21 +44,28 @@ class MihaaruScraper | ||||
|             $this->author = $cleaneddata; | ||||
|         }); | ||||
|  | ||||
|         $crawler->filter('article p')->each(function ($node) { | ||||
|             $this->content[] = preg_replace("/[a-zA-Z]/","",$node->text()); | ||||
|         $crawler->filter('.text-faseyha')->each(function ($node) { | ||||
|             $this->content[] = $node->text(); | ||||
|         }); | ||||
|  | ||||
|         $crawler->filter('.article-tags')->each(function ($node) { | ||||
|             | ||||
|         $crawler->filter('.items-end a')->each(function ($node) { | ||||
|  | ||||
|             try { | ||||
|                 $topicName = $node->filter('span')->text(); | ||||
|                 $topicSlug =  ltrim($node->attr('href'), '/'); | ||||
|             } catch (\Throwable $th) { | ||||
|                 return; | ||||
|             } | ||||
|  | ||||
|             $this->topics[] = [ | ||||
|                 "name" => $node->text(), | ||||
|                 "slug" => str_replace("https://mihaaru.com/", "", $node->attr('href')) | ||||
|                 "name" => $topicName, | ||||
|                 "slug" => $topicSlug | ||||
|             ]; | ||||
|         }); | ||||
|  | ||||
|         //Remove all the alphabets from string | ||||
|         //preg_replace("/[a-zA-Z]/", "",$string); | ||||
|        return [ | ||||
|         return [ | ||||
|             'source'    => 'Mihaaru', | ||||
|             'title'      => $this->title, | ||||
|             'og_title'   => $crawler->filter('meta[property*="og:title"]')->first()->attr('content'), | ||||
| @@ -63,7 +73,7 @@ class MihaaruScraper | ||||
|             'content'    => $this->content, | ||||
|             'url'        => $url, | ||||
|             'date'       => $date, | ||||
|             'guid'       => str_replace("https://mihaaru.com/news/","",$url), | ||||
|             'guid'       => str_replace("https://mihaaru.com/news/", "", $url), | ||||
|             'author'     => $this->author, | ||||
|             'topics'       => $this->topics | ||||
|         ]; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user