diff --git a/app/Console/Commands/ScrapeMihaaruCommand.php b/app/Console/Commands/ScrapeMihaaruCommand.php new file mode 100644 index 0000000..e3137fa --- /dev/null +++ b/app/Console/Commands/ScrapeMihaaruCommand.php @@ -0,0 +1,42 @@ +get("https://mihaaru.com/rss")["channel"]["item"]; + + $articlesitems = []; + $emihaaru = new MihaaruScraper(); + + foreach ($articles as $article) { + $link = $article['link']; + $date = $article['pubDate']; + $guid = $article['guid']; + $articlesitems[] = $emihaaru->extract($link, $date, $guid); + } + + return $articlesitems; + } +} diff --git a/app/Services/Scrapers/MihaaruScraper.php b/app/Services/Scrapers/MihaaruScraper.php new file mode 100644 index 0000000..e94af7e --- /dev/null +++ b/app/Services/Scrapers/MihaaruScraper.php @@ -0,0 +1,75 @@ +client = new Client; + } + + public function extract($url) + { + + $crawler = $this->client->request('GET', $url); + + $crawler->filter('h1')->each(function ($node) { + $title = $node->text(); + $this->title = $title; + }); + + $crawler->filter('.container img')->eq(3)->each(function ($node) { + $image = $node->attr('src'); + $this->image = $image; + }); + + $crawler->filter('.by-line address')->each(function ($node) { + $author = $node->text(); + //Trim all the white spaces + $spacetrim = str_replace(' ', '', $author); + //Replace multiple spaces and newlines with a single space + $cleaneddata = trim(preg_replace('/\s\s+/', ' ', $spacetrim)); + $this->author = $cleaneddata; + }); + + $crawler->filter('article')->each(function ($node) { + $content = $node->text(); + + $input = str_replace("\n", '', $content); + $this->content = $input; + }); + + $crawler->filter('.article-tags')->each(function ($node) { + $tags[] = [ + "name" => $node->text(), + "slug" => str_replace("https://mihaaru.com/", "", $node->attr('href')) + ]; + $this->tags[] = $tags; + }); + + //Remove all the alphabets from string + //preg_replace("/[a-zA-Z]/", "",$string); + $data = [ + 'source' => 'Mihaaru', + 'title' => $this->title, + 'image' => $this->image, + 'content' => $this->content, + 'url' => $url, + 'author' => $this->author, + 'topics' => $this->tags, + ]; + + return $data; + } +} diff --git a/composer.json b/composer.json index 01d5b09..4da691f 100644 --- a/composer.json +++ b/composer.json @@ -9,6 +9,7 @@ "license": "MIT", "require": { "php": "^7.2.5", + "fabpot/goutte": "^4.0", "fideloper/proxy": "^4.2", "fruitcake/laravel-cors": "^2.0", "goldspecdigital/laravel-eloquent-uuid": "^7.0", diff --git a/composer.lock b/composer.lock index 9103290..71fc5cd 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "8015ef38fcd1634dba0af7dfd8d3c8f6", + "content-hash": "4258a59a9dc5e5b92894c963d718a365", "packages": [ { "name": "asm89/stack-cors", @@ -388,6 +388,62 @@ ], "time": "2020-06-16T20:11:17+00:00" }, + { + "name": "fabpot/goutte", + "version": "v4.0.0", + "source": { + "type": "git", + "url": "https://github.com/FriendsOfPHP/Goutte.git", + "reference": "05f6994ec1d0d8368157de7fe45063e751857086" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/FriendsOfPHP/Goutte/zipball/05f6994ec1d0d8368157de7fe45063e751857086", + "reference": "05f6994ec1d0d8368157de7fe45063e751857086", + "shasum": "" + }, + "require": { + "php": "^7.1.3", + "symfony/browser-kit": "^4.4|^5.0", + "symfony/css-selector": "^4.4|^5.0", + "symfony/dom-crawler": "^4.4|^5.0", + "symfony/http-client": "^4.4|^5.0", + "symfony/mime": "^4.4|^5.0" + }, + "require-dev": { + "symfony/phpunit-bridge": "^5.0" + }, + "type": "application", + "extra": { + "branch-alias": { + "dev-master": "4.0-dev" + } + }, + "autoload": { + "psr-4": { + "Goutte\\": "Goutte" + }, + "exclude-from-classmap": [ + "Goutte/Tests" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + } + ], + "description": "A simple PHP Web Scraper", + "homepage": "https://github.com/FriendsOfPHP/Goutte", + "keywords": [ + "scraper" + ], + "time": "2019-12-06T13:44:35+00:00" + }, { "name": "fideloper/proxy", "version": "4.4.0", @@ -2122,6 +2178,65 @@ ], "time": "2019-11-12T09:31:26+00:00" }, + { + "name": "symfony/browser-kit", + "version": "v5.1.3", + "source": { + "type": "git", + "url": "https://github.com/symfony/browser-kit.git", + "reference": "b9545e08790be2d3d7d92306e339bbcd79f461e4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/browser-kit/zipball/b9545e08790be2d3d7d92306e339bbcd79f461e4", + "reference": "b9545e08790be2d3d7d92306e339bbcd79f461e4", + "shasum": "" + }, + "require": { + "php": ">=7.2.5", + "symfony/dom-crawler": "^4.4|^5.0" + }, + "require-dev": { + "symfony/css-selector": "^4.4|^5.0", + "symfony/http-client": "^4.4|^5.0", + "symfony/mime": "^4.4|^5.0", + "symfony/process": "^4.4|^5.0" + }, + "suggest": { + "symfony/process": "" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "5.1-dev" + } + }, + "autoload": { + "psr-4": { + "Symfony\\Component\\BrowserKit\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony BrowserKit Component", + "homepage": "https://symfony.com", + "time": "2020-06-24T13:36:18+00:00" + }, { "name": "symfony/console", "version": "v5.1.3", @@ -2304,6 +2419,68 @@ "homepage": "https://symfony.com", "time": "2020-06-06T08:49:21+00:00" }, + { + "name": "symfony/dom-crawler", + "version": "v5.1.3", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "a96aecb36aaf081f1b012e1e62d71f1069ab3dca" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/a96aecb36aaf081f1b012e1e62d71f1069ab3dca", + "reference": "a96aecb36aaf081f1b012e1e62d71f1069ab3dca", + "shasum": "" + }, + "require": { + "php": ">=7.2.5", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0", + "symfony/polyfill-php80": "^1.15" + }, + "conflict": { + "masterminds/html5": "<2.6" + }, + "require-dev": { + "masterminds/html5": "^2.6", + "symfony/css-selector": "^4.4|^5.0" + }, + "suggest": { + "symfony/css-selector": "" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "5.1-dev" + } + }, + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony DomCrawler Component", + "homepage": "https://symfony.com", + "time": "2020-07-23T08:36:24+00:00" + }, { "name": "symfony/error-handler", "version": "v5.1.3", @@ -2544,6 +2721,139 @@ "homepage": "https://symfony.com", "time": "2020-05-20T17:43:50+00:00" }, + { + "name": "symfony/http-client", + "version": "v5.1.3", + "source": { + "type": "git", + "url": "https://github.com/symfony/http-client.git", + "reference": "050dc633a598bdadbd49449500c87e30dabe5c58" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/http-client/zipball/050dc633a598bdadbd49449500c87e30dabe5c58", + "reference": "050dc633a598bdadbd49449500c87e30dabe5c58", + "shasum": "" + }, + "require": { + "php": ">=7.2.5", + "psr/log": "^1.0", + "symfony/http-client-contracts": "^2.1.1", + "symfony/polyfill-php73": "^1.11", + "symfony/polyfill-php80": "^1.15", + "symfony/service-contracts": "^1.0|^2" + }, + "provide": { + "php-http/async-client-implementation": "*", + "php-http/client-implementation": "*", + "psr/http-client-implementation": "1.0", + "symfony/http-client-implementation": "1.1" + }, + "require-dev": { + "amphp/http-client": "^4.2.1", + "amphp/http-tunnel": "^1.0", + "amphp/socket": "^1.1", + "guzzlehttp/promises": "^1.3.1", + "nyholm/psr7": "^1.0", + "php-http/httplug": "^1.0|^2.0", + "psr/http-client": "^1.0", + "symfony/dependency-injection": "^4.4|^5.0", + "symfony/http-kernel": "^4.4|^5.0", + "symfony/process": "^4.4|^5.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "5.1-dev" + } + }, + "autoload": { + "psr-4": { + "Symfony\\Component\\HttpClient\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony HttpClient component", + "homepage": "https://symfony.com", + "time": "2020-07-06T13:23:11+00:00" + }, + { + "name": "symfony/http-client-contracts", + "version": "v2.1.3", + "source": { + "type": "git", + "url": "https://github.com/symfony/http-client-contracts.git", + "reference": "cd88921e9add61f2064c9c6b30de4f589db42962" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/http-client-contracts/zipball/cd88921e9add61f2064c9c6b30de4f589db42962", + "reference": "cd88921e9add61f2064c9c6b30de4f589db42962", + "shasum": "" + }, + "require": { + "php": ">=7.2.5" + }, + "suggest": { + "symfony/http-client-implementation": "" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.1-dev" + }, + "thanks": { + "name": "symfony/contracts", + "url": "https://github.com/symfony/contracts" + } + }, + "autoload": { + "psr-4": { + "Symfony\\Contracts\\HttpClient\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Generic abstractions related to HTTP clients", + "homepage": "https://symfony.com", + "keywords": [ + "abstractions", + "contracts", + "decoupling", + "interfaces", + "interoperability", + "standards" + ], + "time": "2020-07-06T13:23:11+00:00" + }, { "name": "symfony/http-foundation", "version": "v5.1.3",