Alexandre Carette Posted May 16, 2017 Share Posted May 16, 2017 (edited) Objectif: Permettre de récupérer les produits d'un site Prestashop en format CSV en vue d'une importation future Autre article sur le même thème : Scraping d'un site Prestashop avec Puppeteer Disclaimer: https://fr.wikipedia.org/wiki/Web_scraping OS utilisé: Ubuntu 17.04 Version de scrapy 1.3.3 Version de prestashop 1.6.13 Theme: default Installation de scrapy (framework de web crawling en python) dans un terminal: sudo apt install python-pip sudo pip install Scrapy 1) Création du projet: scrapy startproject prestashop16 2) On édite items.py # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html from scrapy.item import Item, Field class Prestashop16Item(Item): # define the fields for your item here like: # name = scrapy.Field() url = Field() balise_title = Field() balise_meta_description = Field() h1 = Field() reference = Field() quantity = Field() description_courte = Field() description_longue = Field() prix_ttc = Field() images = Field() main_image = Field() pass 3) On édite settings.py # -*- coding: utf-8 -*- # Scrapy settings for prestashop16 project # # For simplicity, this file contains only settings considered important or # commonly used.
You can find more settings consulting the documentation: # # http://doc.scrapy.org/en/latest/topics/settings.html # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html BOT_NAME = 'prestashop16' SPIDER_MODULES = ['prestashop16.spiders'] NEWSPIDER_MODULE = 'prestashop16.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent # Ou on utilise google bot # USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' USER_AGENT = 'prestashop16 (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs # On crawl une page par seconde DOWNLOAD_DELAY = 1 # On enregistre les données dans un fichier CSV FEED_URI = '/home/nom_utilisateur/desktop/liste_produits_prestashop.csv' # On veut un CSV FEED_FORMAT ='csv' FEED_EXPORTERS_BASE = { 'csv':'scrapy.exporters.CsvItemExporter', } # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'prestashop16.middlewares.Prestashop16SpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'prestashop16.middlewares.MyCustomDownloaderMiddleware': 543, #} # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # 'prestashop16.pipelines.Prestashop16Pipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 4) On crée dans le dossiers spiders -> le fichier presta_bot.py #!/usr/bin/env python # -*- coding: utf-8 -*- from scrapy.spiders import CrawlSpider from scrapy.linkextractors import LinkExtractor from prestashop16.items import Prestashop16Item from scrapy.selector import Selector from scrapy.http import Request class prestashop16(CrawlSpider): name="presta_bot" #on autorise seulement le crawl du site indiqué dans allowed_domains allowed_domains = ['demo-prestashop-16.terracode.de'] # on definit l'id du produit de départ start_id_product = 1 # on definit l'id du produit de 
fin end_id_product = 5 #on boucle la requete sur la rangée d'id def start_requests(self): for i in range(self.start_id_product,self.end_id_product): yield Request('https://demo-prestashop-16.terracode.de/index.php?controller=product&id_product=%d' % i, callback=self.parse_items) def parse_items(self,response): #récupération des datas récoltées (contenu de la page produit) sel = Selector(response) #on prépare item item = Prestashop16Item() item['url'] = response.url item['balise_title'] = sel.xpath('//title/text()').extract() item['balise_meta_description'] = sel.xpath('/html/head/meta[@name="description"]/@content').extract() item['h1'] = sel.xpath('//h1/text()').extract() item['reference'] = sel.xpath('//span[contains(@itemprop, "sku")]/@content').extract() item['quantity'] = sel.xpath('//span[@id="quantityAvailable"]/text()').extract() item['description_courte'] = sel.xpath('//div[@id="short_description_content"]//p/text()').extract() item['description_longue'] = sel.xpath('//section[@class="page-product-box"]//div[@class="rte"]//p/text()').extract() item['prix_ttc'] = sel.xpath('//span[contains(@itemprop, "price")]/@content').extract() item['images'] = sel.xpath('//ul[@id="thumbs_list_frame"]/li/a/@href').extract() item['main_image'] = sel.xpath('//div[@id="image-block"]//span[@id="view_full_size"]//img/@src').extract() # on fait passer item à la suite du processus yield item 5) on lance le bot alexandre@ordi-alexandre:~/prestashop16$ scrapy crawl presta_bot 6) on recupere le csv sur le bureau Edited October 21, 2024 by Alexandre Carette (see edit history) 2 Link to comment Share on other sites More sharing options...
Alexandre Carette Posted May 18, 2017 Author Share Posted May 18, 2017 (edited) Objectif: Synchroniser les stocks produits (sans déclinaisons) d'un site prestashop A vers un site prestashop B - On utilise ici l'id du produit comme clef, donc le site A et B doivent avoir les mêmes id_product 1) Installer scrapy sur votre serveur 2) On crée un nouveau projet dans /home/mes_crawlers/ scrapy startproject ps16stock 2) On édite items.py # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html import scrapy class Ps16StockItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() quantity = scrapy.Field() id_product = scrapy.Field() pass 3) On édite settings.py on active ITEM_PIPELINES # -*- coding: utf-8 -*- # Scrapy settings for ps16stock project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # http://doc.scrapy.org/en/latest/topics/settings.html # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html BOT_NAME = 'ps16stock' SPIDER_MODULES = ['ps16stock.spiders'] NEWSPIDER_MODULE = 'ps16stock.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent USER_AGENT = 'ps16stock (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = True # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs DOWNLOAD_DELAY = 1 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False #
Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: #DEFAULT_REQUEST_HEADERS = { # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Language': 'en', #} # Enable or disable spider middlewares # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'ps16stock.middlewares.Ps16StockSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'ps16stock.middlewares.MyCustomDownloaderMiddleware': 543, #} # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { 'ps16stock.pipelines.Ps16StockPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) # See http://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 4) On crée dans le dossiers spiders -> le fichier stock_bot.py #!/usr/bin/env python # -*- 
coding: utf-8 -*- from scrapy.spiders import CrawlSpider from scrapy.linkextractors import LinkExtractor from ps16stock.items import Ps16StockItem from scrapy.selector import Selector from scrapy.http import Request class Ps16Stock(CrawlSpider): name="stock_bot" #on autorise seulement le crawl du site indiqué dans allowed_domains allowed_domains = ['demo-prestashop-16.terracode.de'] # on definit l'id du produit de départ start_id_product = 1 # on definit l'id du produit de fin end_id_product = 5 #on boucle la requete sur la rangée d'id def start_requests(self): for i in range(self.start_id_product,self.end_id_product): yield Request('https://demo-prestashop-16.terracode.de/index.php?controller=product&id_product=%d' % i, callback=self.parse_items) def parse_items(self,response): #récupération des datas récoltées (contenu de la page produit) sel = Selector(response) #on prépare item item = Ps16StockItem() item['id_product'] = sel.xpath('//input[@type="hidden"][@name="id_product"]/@value').extract()[0] item['quantity'] = sel.xpath('//span[@id="quantityAvailable"]/text()').extract()[0] return item 5) on édite pipelines.py # -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html import MySQLdb class Ps16StockPipeline(object): def __init__(self): print "Opening connection mysql..."
self.conn = MySQLdb.connect( #utilisateur mySQL user='utilisateur_mysql', #password mySQL passwd='password', #nom de la bdd db='scrapy1', #adresse du serveur mySql host='localhost', charset='utf8', use_unicode=True ) self.cursor = self.conn.cursor() print "Opening ok" def process_item(self, item, spider): try: self.cursor.execute("""UPDATE ps_stock_available SET quantity=%s WHERE id_product=%s""", (item['quantity'], item['id_product'])) self.conn.commit() except MySQLdb.Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) return item 6) On crée un fichier bash ps16stock.sh dans /home/mes_crawlers/ #!/bin/bash cd /home/mes_crawlers/ps16stock/ PATH=$PATH:/usr/local/bin export PATH scrapy crawl stock_bot 7) on met en place une tâche cron tous les jours à 5 heures du matin crontab -e 0 5 * * * sh /home/mes_crawlers/ps16stock.sh Edited May 18, 2017 by Alexandre Carette (see edit history) Link to comment Share on other sites More sharing options...
dandumit Posted June 30, 2019 Share Posted June 30, 2019 On 5/17/2017 at 12:45 AM, Alexandre Carette said: 6) on recupere le csv sur le bureau Merci beaucoup ! (that's all my French) Please tell me , is there any way to get the translated version of site ? like translation from Chrome ? Thank you, Daniel Link to comment Share on other sites More sharing options...
Aboumalak Posted February 11, 2020 Share Posted February 11, 2020 Bonjour Alexandre, Merci pour ce tuto. J'ai suivi à la lettre les étapes, je reçois malheuresuement une erreur , :( la suivante : scrapy crawl presta_bot Traceback (most recent call last): File "/usr/local/bin/scrapy", line 11, in <module> sys.exit(execute()) File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 145, in execute cmd.crawler_process = CrawlerProcess(settings) File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 267, in __init__ super(CrawlerProcess, self).__init__(settings) File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 145, in __init__ self.spider_loader = _get_spider_loader(settings) File "/usr/local/lib/python2.7/dist-packages/scrapy/crawler.py", line 347, in _get_spider_loader return loader_cls.from_settings(settings.frozencopy()) File "/usr/local/lib/python2.7/dist-packages/scrapy/spiderloader.py", line 61, in from_settings return cls(settings) File "/usr/local/lib/python2.7/dist-packages/scrapy/spiderloader.py", line 25, in __init__ self._load_all_spiders() File "/usr/local/lib/python2.7/dist-packages/scrapy/spiderloader.py", line 47, in _load_all_spiders for module in walk_modules(name): File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 73, in walk_modules submod = import_module(fullpath) File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module __import__(name) File "/home/omar/prestashop16/prestashop16/spiders/presta_bot.py", line 5, in <module> from prestashop16.items import Prestashop16Item File "/home/omar/prestashop16/prestashop16/items.py", line 11, in <module> class Prestashop16Item(scrapy.Item): NameError: name 'scrapy' is not defined Link to comment Share on other sites More sharing options...
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new accountSign in
Already have an account? Sign in here.
Sign In Now