Hello,
I'm no lxml expert, so it could be a newbie error…but the following web
scrawler script sometimes breaks (see "BUG") while trying to find the number of
provinces/properties, even after two one-second sleeps:
==========
import requests
from lxml import html
import re
import math
import time
def grab_properties():
properties = soup.xpath("//div[contains(@class,'gallery')]/a/@href")
for property in properties:
print(property)
response = requests.get(property)
coords = pattern_coords.search(response.text) #raw HTML since
data in JSON
if coords:
lat,lon=coords.group(1),coords.group(2)
print(f"{lat}\t{lon}")
pattern_count = re.compile("(\d+) Propertie") #ignore trailing s for
singular/plural
pattern_coords = re.compile("latitude:(.+?),longitude:([^}]+)") #JSON
provinces = ["a", "b", "c"]
for province in provinces:
time.sleep(1) #added but still no cigar
url = f"https://www.acme.com/{province}/"
print("======== ",url)
response = requests.get(url)
soup = html.fromstring(response.text)
#BUG time-out?
count = soup.xpath("//div[contains(@class,'properties-count')]/text()")
print(count)
count = pattern_count.search(count[0])
if count:
print("Number of locations:",count)
locations =
soup.xpath("//div[contains(@class,'other-location-box')]/a/@href")
for location in locations:
time.sleep(1) #added but still no cigar
print(location)
response = requests.get(location)
soup = html.fromstring(response.text)
#BUG time-out?
count =
soup.xpath("//div[contains(@class,'properties-count')]/text()")
print(count)
count = pattern_count.search(count[0])
if not count:
print("Number of properties not found")
break #next location
else:
print("Number of properties found",count.group(1))
#grab what's in current, first page
grab_properties()
#If > 30, must update URL and loop through pages by
groups of 30
count = int(count.group(1))
for index in range (2,math.ceil(count/30)+1):
time.sleep(1) #added but still no cigar
url = f"{location}p/{index}/" #new URL
response = requests.get(url)
soup = html.fromstring(response.text)
grab_properties()
==========
Am I using the wrong syntax to grab the numbers?
Thank you.
_______________________________________________
lxml - The Python XML Toolkit mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/lxml.python.org/
Member address: [email protected]