After debug, I find that may be the bug code area:
# -*- coding: gbk -*-
import scrapy
from scrapy.http import FormRequest
import json
import os
from datetime import datetime
from scrapy.selector import Selector
from teacherCourse.handlePic import handle
from teacherCourse.items import DetailProfItem
from teacherCourse.items import DetailProfCourseItem
from teacherCourse.items import containItem

class GetTeacherCourseSpider(scrapy.Spider):
    name = 'TeacherCourse'
#    custom_settings = {
#            'ITEM_PIPELINES': {
#                'teacherCourse.pipelines.TeacherCoursePipeline': 300,
#                }
#            }

    def __init__(self, selXNXQ='', titleCode=''):
        self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' 
# first
        self.vcodeUrl = 
'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second
        self.postUrl = 
'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third
        self.findSessionId = None # to save the cookies
        self.XNXQ = selXNXQ
        self.titleCode = titleCode

    def start_requests(self):
        request = scrapy.Request(self.getUrl,
               callback = self.downloadPic)
        yield request

    def downloadPic(self, response):
        # download the picture
        # find the session id
        self.findSessionId = 
response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=")
        request = scrapy.Request(self.vcodeUrl,
                cookies= {self.findSessionId[0]: self.findSessionId[1]},
                callback = self.getAndHandleYzm)
        yield request
    
    def getAndHandleYzm(self, response):
        yzm = handle(response.body)
        
        yield FormRequest(self.postUrl,
                formdata={'Sel_XNXQ': '20151',
                          'sel_zc': '011',
                          'txt_yzm': yzm,
                          'type': '2'},
                headers={
                    'Referer': 
'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
                    'Cookie': self.findSessionId[0] + '=' + 
self.findSessionId[1],
                    },


                callback=self.parse)

    def parse(self, response):
        body = response.body.decode('gbk')
# bug code begin I don't know why, I comment them and the pipeline can work
        num = body.find('alert')
        if num != -1:
            # means CAPTCHA validation fails, need to re-request the CAPTCHA
            yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % 
(datetime.now().microsecond / 1000),
            headers={
                    'Referer': 
'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
                    'Cookie': 
self.findSessionId[0]+'='+self.findSessionId[1]
                    },
            callback=self.getAndHandleYzm) # re request the url to solve 
the validation code fail problem
# bug code done
        else:
            # parse data
#            self.parseData(body)
            item = containItem()
            item['first'] = len(body)
            return item
Any suggestions would be appreciate.
 

-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

Reply via email to