python 3.x - Passing Scrapy response URL to Selenium and then Selenium response back to Scrapy -
How do I pass a Scrapy response URL to Selenium, and then pass the Selenium response back to Scrapy?
I have a Scrapy spider, first.py:
# -*- coding: utf-8 -*-
"""Spider that scrapes car adverts and follows their insurance-quotation links."""
import json
import re

import scrapy

# Captures the object literal assigned to ``var utag_data`` in an inline
# <script>. Compiled once at import time instead of on every parse_car() call.
UTAG_DATA_RE = re.compile(r"var utag_data = ({.*?});", re.MULTILINE | re.DOTALL)


class firstspider(scrapy.Spider):
    """Crawl a search-results page, scrape each car, then visit its quotation link."""

    name = "first"
    allowed_domains = ["someautosite.co.uk"]
    start_urls = (
        'http://www.someautosite.co.uk/some_specific_search_results',
    )

    def parse(self, response):
        """Yield one request per car advert linked from the search-results page."""
        car_links = response.xpath(
            '//article[contains(@class, "standard")]/div/div[2]/div[1]/h1/a/@href'
        ).extract()
        for car_url in car_links:
            # The hrefs may be relative; resolve them against the page URL.
            absoluteurl = response.urljoin(car_url)
            yield scrapy.Request(absoluteurl, callback=self.parse_car)

    def parse_car(self, response):
        """Yield the car's data (utag_data plus telephone) and a quotation request.

        The item is the page's ``utag_data`` object with a ``"tel"`` key added.
        If ``utag_data`` is missing or is not valid JSON, a warning is logged
        and the item contains only ``"tel"``.
        """
        utag_data = response.xpath(
            '//script[contains(.,"var utag")]/text()'
        ).re_first(UTAG_DATA_RE)

        utag_data_obj = {}
        if utag_data is None:
            # Previously ``.re(pattern)[0]`` raised IndexError here.
            self.logger.warning("utag_data not found on %s", response.url)
        else:
            try:
                utag_data_obj = json.loads(utag_data)
            except ValueError:
                self.logger.warning("utag_data on %s is not valid JSON", response.url)

        tel = response.xpath(
            '//article/div[3]/section/div/div[@itemprop="telephone"]/text()'
        ).extract_first()

        # Combine the two pieces of data into one item without mutating the
        # parsed utag_data object.
        car_json = utag_data_obj.copy()
        car_json.update({"tel": tel})
        yield car_json

        quotations_url = response.xpath(
            '/html/body/article/section/ul/li[2]/a/@href'
        ).extract_first()
        if quotations_url:
            # extract_first() returns None when the link is absent, and the
            # href may be relative; scrapy.Request needs an absolute URL.
            yield scrapy.Request(
                response.urljoin(quotations_url),
                callback=self.parse_quotations,
            )
        else:
            self.logger.warning("no quotation link found on %s", response.url)

    def parse_quotations(self, response):
        """Handle the insurance quotation page (to be driven by Selenium)."""
        # Parse the insurance quotation website link with Selenium.
        # NOTE(review): filldata2 is a flat script, so this import runs it as a
        # side effect, and only on the first call. ``response.url`` is not
        # passed to it and nothing is returned from it yet.
        import filldata2  # noqa: F401
And I have a Selenium module, filldata2.py, that tries to get a quotation for the car from the URL link extracted in the parse_car method of the Scrapy spider code above.
Now, the Selenium module starts like this:
from selenium import webdriver selenium.webdriver.common.by import selenium.webdriver.support.ui import webdriverwait # available since 2.4.0 selenium.webdriver.support import expected_conditions ec # available since 2.26.0 selenium.webdriver.common.keys import keys import time import 6 six.moves.configparser import safeconfigparser regno = 'aa00aaa' mile = '15000' firstname = 'john' lastname = 'smith' [...] def yesno(idul): idxpath = '//*[@id="{}"]'.format(idul) return idxpath def findid(idul): found = driver.find_element_by_id(idul) return found def clickyes(idul): idxpath = '//*[@id="{}"]'.format(idul) arg = '{}//span[contains(text(), "yes")]'.format(idxpath) return driver.find_element_by_xpath(arg).click() def clickno(idul): idxpath = '//*[@id="{}"]'.format(idul) arg = '{}//span[contains(text(), "no")]'.format(idxpath) return driver.find_element_by_xpath(arg).click() def clickspan(idul): idxpath = '//*[@id="{}"]'.format(idul) arg = '{}//span[1]'.format(idxpath) driver.find_element_by_xpath(arg).click() class divselect(object): def __init__(self, idul, divtext): self.idul = idul self.divtext = divtext # exemplu: '//div[contains(text(), "right hand")]' # self.divulxpath = '//div[contains(text(), "{}")]'.format(self.divtext) self.idxpath = '//*[@id="{}"]'.format(self.idul) def findid(self): el = 'driver.find_element_by_id({})'.format(self.idul) return el @property def clicky(self): # merge doar la selectare de divuri if len(str(self.divtext)) >= 2 , not self.divtext.isdigit(): arg = '{}//div[contains(text(), "{}")]'.format(self.idxpath, self.divtext) else: arg = '{}//div[{}]/label/div'.format(self.idxpath, self.divtext) print('driver.find_element_by_xpath("{}").click()'.format(arg)) driver.find_element_by_xpath(arg).click() def printval(cee, cssid): def getval(): val = driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).get_attribute('value') if not val: val = input('care e valoarea masinii:\n') 
driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).click() fillin(cssid, val) time.sleep(2) # print(val) # assert isinstance(val, object) return val valoare = getval() if valoare.lower() == 'pret': print('{} estimat este : £ {} '.format(cee, valoare)) if valoare else 'nu era nici un {}({}) estimat'.format( cee, cssid) else: print('{} estimat/a/e este : {} '.format(cee, valoare)) if valoare else 'nu era nici un {}({}) estimat'.format( cee, cssid) def clickbutton(cssid): driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).click() def fillin(cssid, var): return driver.find_element_by_id(str(cssid)).send_keys(var) def fillinsugestionbox(cssid, var): driver.find_element_by_id(str(cssid)).send_keys(var) return driver.find_element_by_xpath('//*[@id=\"{0}\"]'.format(cssid)).send_keys(keys.return) knowsregistrationnumber = yesno('knows-registration-number').clickyes # 1.2 please enter here started: registrationnumber = driver.find_element_by_id('registration-number') registrationnumber.send_keys(regno) # 1.3 find vehicle find-vehicle-by-reg findvehiclebyreg = driver.find_element_by_id('find-vehicle-by-reg') findvehiclebyreg.click() time.sleep(1) # todo : if no other variants # 1.3.1 multiple-vehicles-section : select list more options # multiplevehiclessection = driver.find_element_by_id('multiple-vehicles-section') # multiplevehiclessection.click() # possible-vehicles : select list id try: element = webdriverwait(driver, 10).until(ec.presence_of_element_located((by.id, "possible-vehicles"))) possiblevehicles = driver.find_element_by_id('possible-vehicles') possiblevehicles.click() print('am asteptat destul') dropdown = possiblevehicles.find_elements_by_tag_name('option') print('am selectat :\n {} \n dintre urmatoarele:'.format(dropdown[1].text)) option in dropdown[1:]: print(option.text) if dropdown: dropdown[1].click() except: print('elementul possible-vehicles nu e prezent') # finally: time.sleep(2) # //*[@id="has-tracker"] yes/no hastracker = 
yesno('has-tracker').clickno # //*[@id="imported"] yes/no imported = yesno('imported').clickno # //*[@id="steering"] - 2 divs # choose options : # left hand or # right hand steering = divselect('steering', 'right hand').clicky # todo: vezi ce faci daca nu pret setat. pune tu unul # //*[@id="current-value"] - citeste valoarea # driver.find_element_by_xpath('//*[@id="current-value"]') printval('pret', 'current-value') # print('pretul estimat este : £ {} '.format(currentvalue)) if currentvalue else 'nu era nici un pret estimat' printval('scaune', 'numberofseats-dropdown') # //*[@id="has-modifications"] hasmodifications = yesno('has-modifications').clickno # clik next button # //*[@id="vehicle-lookup-next"] clickbutton('vehicle-lookup-next') time.sleep(1) # ============================================ # 2. vehicle usage | # ============================================ # 2.1 when did buy car? # //*[@id="vehicle-usage"]//span[1] vehicleusage = yesno('vehicle-usage').clickspan # haven't bought car yet # 2.2 use car for? # //*[@id="use-of-vehicle"]/ol/li[2]/div[2]/label/div/div[2] # //*[@id="use-of-vehicle"]//div[2] useofvehicle = divselect('use-of-vehicle', '2').clicky # social, domestic, pleasure , commuting (sdpc) # 2.3 annual personal mileage is? # //*[@id="annual-mileage"] annualmileage = driver.find_element_by_id('annual-mileage') annualmileage.send_keys(mile) [...much more...] ... ... fillin('email', email) # main telephone number # let insurance providers answer queries # let keep date # //*[@id="communication-options"]/ol/li[2]/div[4]/label/div/div[2] divselect('communication-options', 'post').clicky # please tick box confirm have read , understood our website terms , conditions, \ # assumptions may have made , rewards terms , conditions. \ # if not understand items within document please contact us. 
# //*[@id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span # yesno('contact-details').clickspan - nu merge driver.find_element_by_xpath('//*[@id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span').click() # //*[@id="contact-details-next"] clickbutton('contact-details-next') driver.implicitly_wait(10) try: element = webdriverwait(driver, 10).until(ec.visibility_of_element_located((by.id, "quotes"))) print('element = ', element) try: """ wait loading bar go away: """ element2 = webdriverwait(driver, 60).until(ec.invisibility_of_element_located((by.xpath, '//*[@id="quotes-loading-container"]/div/div[1]'))) print('element2 = ', element2) except: print('bara de loading inca este activa. butonul more details cu cotatii nu e vizibil') except: print('tabelul cu cotatii nu e vizibil') source_code = driver.find_element_by_id('quotes').get_attribute('innerhtml') # element.get_attribute('innerhtml') f = open('c:\\users\\zzz\\pycharmprojects\\selenscrapy\\'+str(regno)+'.html', 'wb') f.write(source_code.encode('utf-8')) f.close()
I know the code is messy. I'm a Python beginner, and I'm playing with code to scrape cars from a car selling website and to try to get an insurance quotation for them from a different site. The link to the outer insurance quotation site (full of JavaScript, that's why I need Selenium WebDriver) is a redirection link from the car selling site, because the 2 sites collaborate. Now, as I said before, the quotation URL needs to be parsed with Selenium, which I'd like to keep in a separate module file, or maybe in 2 separate files: 1 for the config and 1 for the actions taken.
How do I pass the insurance quotation URL obtained in the Scrapy firstspider parse_car() method to the Selenium module, and then pass the response of the Selenium script (which is called source_code in the second module above) back to Scrapy, in the firstspider parse_quotations() method?
Thank you!
Comments
Post a Comment