I am scraping a particular website's JSON data for dates up to 180 days into
the future, which first requires obtaining an authentication token. While
scraping, the token expires and the HTTP responses come back with a status
code of 401 "Unauthorized". How do I get a new token into the scraper and
continue scraping? Any help is appreciated.
def start_requests(self):
    """Start the crawl by requesting the authorization endpoint.

    Returns a one-element list containing the request for
    ``AUTHORIZATION_URL``; its response is passed to
    ``request_ride_times``.
    """
    token_request = Request(
        url=AUTHORIZATION_URL,
        callback=self.request_ride_times,
    )
    return [token_request]
def request_ride_times(self, response):
    """Queue one ride-times request per filter type and per day.

    ``response`` is the reply to the authorization request; its body is
    decoded as JSON and the ``access_token`` value is sent as the
    ``Authorization`` header of every request yielded here.

    Yields a ``Request`` for each entry of ``FILTER_TYPES`` and each day
    from the current time through ``now + SCRAPE_TIMEFRAME`` (inclusive),
    with ``parse_ride_times`` as callback and ``error_handler`` as errback.

    Raises ``KeyError`` if the decoded body has no ``access_token`` key.
    """
    # parse json data
    data = json.loads(response.body)
    # set auth token in headers
    # NOTE(review): the token is read once here and the same headers are
    # reused for every request queued below, so requests sent after the
    # token expires presumably fail with 401 and end up in error_handler --
    # confirm, and refresh by re-requesting AUTHORIZATION_URL if so.
    headers = {'Authorization': 'BEARER {}'.format(data['access_token'])}
    # note: this probably isn't really necessary but it doesn't hurt (all the
    # site's times we are scraping are in EST)
    now = get_current_time_for_timezone("US/Eastern")
    # get ending timeframe for scraping dates - SCRAPE_TIMEFRAME from now
    until = now + SCRAPE_TIMEFRAME
    for filter_type in FILTER_TYPES:
        filter_url_query_attr = '&filters={}'.format(filter_type)
        scrape_date = now
        while scrape_date <= until:
            url = urljoin(
                SCRAPE_BASE_URL,
                '{}{}&date={}'.format(
                    SCRAPE_BASE_URL_QUERY_STRING,
                    filter_url_query_attr,
                    scrape_date.strftime("%Y-%m-%d"),
                ),
            )
            yield Request(
                url,
                headers=headers,
                callback=self.parse_ride_times,
                errback=self.error_handler,
            )
            scrape_date += timedelta(days=1)
def parse_ride_times(self, response):
    """Yield an item for every result entry that carries a schedule.

    The response body is decoded as JSON and its ``results`` sequence is
    walked in order; entries without a ``schedule`` key are skipped.
    """
    # parse json data
    data = json.loads(response.body)
    results = data['results']
    for index, ride_details in enumerate(results):
        if 'schedule' in ride_details:
            ride_schedule = ride_details['schedule']
            # create item...
            yield item
--
You received this message because you are subscribed to the Google Groups
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to scrapy-users+unsubscribe@googlegroups.com.
To post to this group, send email to scrapy-users@googlegroups.com.
Visit this group at http://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.