Commands and scraper
This commit is contained in:
		
							
								
								
									
										42
									
								
								app/Console/Commands/ScrapeMihaaruCommand.php
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								app/Console/Commands/ScrapeMihaaruCommand.php
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,42 @@
 | 
			
		||||
<?php
 | 
			
		||||
 | 
			
		||||
namespace App\Console\Commands;
 | 
			
		||||
 | 
			
		||||
use Illuminate\Console\Command;
 | 
			
		||||
 | 
			
		||||
class ScrapeMihaaruCommand extends Command
{
    /**
     * Name and signature under which the console command is registered.
     *
     * @var string
     */
    protected $signature = 'scrape:mihaaru';

    /**
     * Human-readable description of the console command.
     *
     * @var string
     */
    protected $description = 'Scrape Mihaaru mv';

    /**
     * Build the command instance.
     *
     * Nothing is configured here beyond what the parent constructor does.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Run the console command.
     *
     * Currently a placeholder: no work is performed and 0 is returned.
     *
     * @return int
     */
    public function handle()
    {
        return 0;
    }
}
 | 
			
		||||
							
								
								
									
										31
									
								
								app/Services/MihaaruService.php
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								app/Services/MihaaruService.php
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,31 @@
 | 
			
		||||
<?php
 | 
			
		||||
 | 
			
		||||
namespace App\Services;
 | 
			
		||||
 | 
			
		||||
use App\Services\Scrapers\MihaaruScraper;
 | 
			
		||||
 | 
			
		||||
class MihaaruService extends Client
{
    /**
     * Scrape every article listed in the Mihaaru RSS feed.
     *
     * Requests https://mihaaru.com/rss through get(), reads the entries found
     * under channel -> item, and hands each entry's link, pubDate and guid to
     * a single MihaaruScraper instance that is shared by the whole run.
     *
     * @return array One extract() result per feed entry, in feed order.
     */
    public function scrape() : array
    {
        $feed = $this->get("https://mihaaru.com/rss");
        $entries = $feed["channel"]["item"];

        $scraper = new MihaaruScraper();
        $results = [];

        foreach ($entries as $entry) {
            $results[] = $scraper->extract(
                $entry['link'],
                $entry['pubDate'],
                $entry['guid']
            );
        }

        return $results;
    }
}
 | 
			
		||||
							
								
								
									
										75
									
								
								app/Services/Scrapers/MihaaruScraper.php
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								app/Services/Scrapers/MihaaruScraper.php
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,75 @@
 | 
			
		||||
<?php
 | 
			
		||||
 | 
			
		||||
namespace App\Services\Scrapers;
 | 
			
		||||
 | 
			
		||||
use Goutte\Client;
 | 
			
		||||
 | 
			
		||||
class MihaaruScraper
{
    /**
     * Goutte client used to request article pages.
     *
     * @var Client
     */
    protected $client;

    /**
     * Text of the last <h1> found on the most recently extracted page.
     *
     * @var string|null
     */
    protected $title;

    /**
     * Text of the <article> element with newlines removed.
     *
     * @var string|null
     */
    protected $content;

    /**
     * Value of the src attribute of the selected image.
     *
     * @var string|null
     */
    protected $image;

    /**
     * Tags collected from the most recently extracted page.
     *
     * @var array
     */
    protected $tags = [];

    /**
     * Cleaned by-line text.
     *
     * @var string|null
     */
    protected $author;

    /**
     * Create the scraper together with its own Goutte client.
     */
    public function __construct()
    {
        $this->client = new Client();
    }

    /**
     * Fetch a Mihaaru article page and pull out its main fields.
     *
     * The scraped values are kept in instance properties, so they are cleared
     * at the start of every call. Without that reset a reused instance would
     * keep appending tags from earlier pages and would report the previous
     * page's title, image, author or content whenever the current page has no
     * matching element.
     *
     * NOTE(review): MihaaruService::scrape() calls this method with two extra
     * arguments (the feed entry's pubDate and guid). Only $url is declared, so
     * PHP discards them and they never reach the returned array. Confirm
     * whether they were meant to be part of the result.
     *
     * @param string $url Address of the article page to request.
     *
     * @return array Keys: source, title, image, content, url, author, topics.
     */
    public function extract($url)
    {
        // Start from a clean slate so nothing leaks in from a previous call.
        $this->title = null;
        $this->content = null;
        $this->image = null;
        $this->author = null;
        $this->tags = [];

        $crawler = $this->client->request('GET', $url);

        // When several <h1> elements exist the last one wins.
        $crawler->filter('h1')->each(function ($node) {
            $this->title = $node->text();
        });

        // Only the fourth matching image (zero-based index 3) is considered.
        $crawler->filter('.container  img')->eq(3)->each(function ($node) {
            $this->image = $node->attr('src');
        });

        $crawler->filter('.by-line address')->each(function ($node) {
            // Remove every literal space character first...
            $withoutSpaces = str_replace(' ', '', $node->text());

            // ...then collapse any remaining run of two or more whitespace
            // characters (newlines, tabs) into one space and trim the ends.
            $this->author = trim(preg_replace('/\s\s+/', ' ', $withoutSpaces));
        });

        $crawler->filter('article')->each(function ($node) {
            $this->content = str_replace("\n", '', $node->text());
        });

        $crawler->filter('.article-tags')->each(function ($node) {
            $tag = [
                "name" => $node->text(),
                "slug" => str_replace("https://mihaaru.com/", "", $node->attr('href')),
            ];

            // Each tag is stored wrapped in its own one-element list. That
            // nesting is the shape this method has always returned, so it is
            // kept as is. NOTE(review): confirm consumers expect it.
            $this->tags[] = [$tag];
        });

        return [
            'source'    => 'Mihaaru',
            'title'      => $this->title,
            'image'      => $this->image,
            'content'    => $this->content,
            'url'        => $url,
            'author'     => $this->author,
            'topics'       => $this->tags,
        ];
    }
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user