Scrapy login to Amazon

Sun 15 May 2016 | tags: Scrapy, Python

Simulating a login to Amazon Associates with Scrapy.
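
The snippets below are all members of one Scrapy spider class (the super() call names it AmazonSpider). A minimal skeleton with the imports they rely on might look like the following sketch; the spider name, the allowed domain, and the example crawl command are assumptions added for illustration:

from scrapy import Spider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector  # old-style selector used below; newer Scrapy uses response.xpath()

class AmazonSpider(Spider):
    name = 'amazon_associates'                  # assumed spider name
    allowed_domains = ['associates.amazon.cn']  # assumed domain
    # credentials come in as spider arguments, e.g.
    #   scrapy crawl amazon_associates -a username=... -a password=...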

start_urls = ['https://associates.amazon.cn/gp/associates/network/reports/report.html']
def __init__(self, username, password, *args, **kwargs):
    super(AmazonSpider, self).__init__(*args, **kwargs)
    self.http_user = username
    self.http_pass = password
    # fields posted to the Amazon login form
    self.formdata = {
        'create': '0',
        'email': self.http_user,
        'password': self.http_pass,
    }
    # browser-like request headers sent with every request
    self.headers = {
        'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }
    self.id = 0  # counter used to number the saved report pages

def start_requests(self):
    for i, url in enumerate(self.start_urls):
        # one cookiejar per start URL; this first request only fetches the login page
        yield FormRequest(url,
                          meta={'cookiejar': i},
                          headers=self.headers,
                          callback=self.login)

def _log_page(self, response, filename):
    # dump the URL, headers and body of a response to a file for debugging
    with open(filename, 'w') as f:
        f.write("%s\n%s\n%s\n" % (response.url, response.headers, response.body))

def login(self, response):
    self._log_page(response, 'amazon_login.html')
    # from_response() copies the login form's hidden fields, merges in our credentials
    # and posts the form; a successful login lands in parse_item
    return [FormRequest.from_response(response,
                                      formdata=self.formdata,
                                      headers=self.headers,
                                      meta={'cookiejar': response.meta['cookiejar']},
                                      callback=self.parse_item)]

def parse_item(self, response):
    self._log_page(response, 'after_login.html')
    hxs = HtmlXPathSelector(response)
    # collect the report links from the reports drop-down menu
    report_urls = hxs.select('//div[@id="menuh"]/ul/li[4]/div//a/@href').extract()
    for report_url in report_urls:
        yield Request(self._ab_path(response, report_url),
                      headers=self.headers,
                      meta={'cookiejar': response.meta['cookiejar']},
                      callback=self.parse_report)

def parse_report(self, response):
    # save each report page under an incrementing numeric filename
    self.id += 1
    self._log_page(response, "%d.html" % self.id)
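
The _ab_path() helper used in parse_item() is not shown in the post; it apparently turns the relative hrefs taken from the menu into absolute URLs. A minimal sketch of such a helper, assuming that is all it does, could be:

from urlparse import urljoin  # Python 2; on Python 3 this lives in urllib.parse

def _ab_path(self, response, href):
    # resolve a possibly relative href against the URL of the page it was found on
    return urljoin(response.url, href)

On Scrapy 1.0 and later, response.urljoin(href) does the same thing without a separate helper.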