Hi friends,
Please take a look at the attached script. I want to fetch some data from an
online resource and have almost achieved this, but I can't fetch some oddly
formatted data such as '45° Reverse Calf Press'. I get the following error:
45
Reverse Calf Press
Reverse Calf Raise
Seated Reverse Calf Press
Traceback (most recent call last):
File "net_exrx.py", line 226, in ?
exercisesList = LoadExercisesList(2, 4, 9)
File "net_exrx.py", line 86, in LoadExercisesList
return parser.GetExercisesList()
File "net_exrx.py", line 176, in GetExercisesList
self.exerList.append([self.desc[i],self.urls[i]])
IndexError: list index out of range
Thanks in advance!
# -*- coding: utf-8 -*-
import urllib
import string
import re
from sgmllib import SGMLParser
# Exercise list pages, one per muscle group.  LoadExercisesList() picks an
# entry from this list by index (its ``url`` argument).
urlList = ['http://www.exrx.net/Lists/ExList/NeckWt.html',
'http://www.exrx.net/Lists/ExList/ShouldWt.html',
'http://www.exrx.net/Lists/ExList/ArmWt.html',
'http://www.exrx.net/Lists/ExList/ForeArmWt.html',
'http://www.exrx.net/Lists/ExList/BackWt.html',
'http://www.exrx.net/Lists/ExList/ChestWt.html',
'http://www.exrx.net/Lists/ExList/WaistWt.html',
'http://www.exrx.net/Lists/ExList/HipsWt.html',
'http://www.exrx.net/Lists/ExList/ThighWt.html',
'http://www.exrx.net/Lists/ExList/CalfWt.html']
# One table of 0/1 flags per muscle group; every row has eight columns.
# NOTE(review): the eight columns presumably line up with the eight entries
# of ``equipmentList`` in LoadExercisesList() and each row with one muscle
# of the group (1 = exercises available) - confirm, nothing in the visible
# script reads these tables.
neck = [[0, 0, 1, 0, 1, 0, 0, 1],
[0, 0, 1, 0, 1, 0, 0, 1]]
shoulders = [[0, 1, 1, 1, 1, 0, 1, 0],
[0, 1, 1, 1, 1, 0, 1, 0],
[0, 1, 1, 1, 1, 0, 1, 0],
[0, 0, 1, 1, 0, 0, 0, 0]]
upper_arms = [[1, 1, 1, 1, 1, 0, 1, 1],
[0, 1, 1, 1, 1, 0, 0, 0],
[0, 1, 1, 1, 1, 0, 0, 0]]
forearms = [[0, 1, 1, 1, 1, 0, 0, 0],
[0, 1, 1, 1, 1, 0, 0, 0],
[0, 1, 1, 1, 1, 0, 0, 0],
[0, 0, 0, 1, 1, 0, 0, 0],
[0, 0, 0, 1, 1, 0, 0, 0]]
back = [[0, 1, 1, 1, 1, 0, 1, 1],
[1, 1, 1, 0, 1, 0, 0, 1],
[0, 1, 1, 1, 1, 1, 1, 0],
[0, 0, 1, 1, 1, 0, 0, 0],
[0, 0, 1, 1, 1, 0, 0, 0]]
chest = [[1, 1, 1, 1, 1, 0, 1, 1],
[0, 1, 1, 1, 1, 0, 1, 0],
[1, 0, 1, 0, 1, 0, 0, 1],
[0, 1, 1, 1, 1, 0, 1, 0]]
waist = [[0, 0, 1, 0, 1, 0, 0, 1],
[0, 0, 1, 1, 1, 0, 0, 1],
[0, 1, 0, 1, 1, 0, 1, 1]]
hips = [[0, 1, 1, 1, 1, 1, 1, 1],
[0, 0, 1, 0, 1, 0, 0, 0],
[0, 0, 1, 0, 1, 0, 0, 1]]
thighs = [[0, 1, 1, 1, 1, 1, 1, 1],
[0, 1, 1, 1, 1, 0, 1, 1]]
calves = [[0, 1, 1, 1, 1, 1, 1, 1],
[0, 1, 0, 0, 1, 0, 1, 0],
[0, 1, 1, 1, 1, 1, 1, 0]]
# All tables, listed in the same muscle-group order as urlList.
exrxList = [neck, shoulders, upper_arms, forearms, back,
chest, waist, hips, thighs, calves]
def LoadExercisesList(muscles, equipment, url):
    """
    Download one exercise list page and return the exercises found on it
    for the given muscle and equipment.

    muscles   -- zero-based muscle index; it is converted to the one-based
                 number that ExercisesListURL uses to select a table cell
    equipment -- index into the equipment names listed below
    url       -- index into the module-level urlList

    Returns the list of [description, url] pairs produced by
    ExercisesListURL.GetExercisesList(), or [] if the page could not be
    opened (IOError from urllib.urlopen).
    """
    equipmentList = ['Assisted', 'Barbell', 'Cable', 'Dumbbell', 'Lever',
                     'Sled', 'Smith', 'Weighted']
    try:
        usock = urllib.urlopen(urlList[url])
    except IOError:
        return []
    # Read the whole page first and always release the connection, even
    # when reading fails or a later step raises.
    try:
        page = usock.read()
    finally:
        usock.close()
    # set up the muscles variable
    muscles += 1
    # workarounds for exercises on some muscles group
    # for 'Back' muscles (urlList[4] is the 'Back' page)
    if url == 4 and muscles > 3:
        muscles += 3
    parser = ExercisesListURL(muscles=muscles,
                              equipment=equipmentList[equipment])
    parser.feed(page)
    parser.close()
    return parser.GetExercisesList()
class ExercisesListURL(SGMLParser):
"""
Class for collecting descriptions and urls for exercise for
selected muscles group
"""
def __init__(self, verbose=0, muscles=1, equipment='assisted'):
SGMLParser.__init__(self, verbose)
self.urls = []
self.desc = []
self.insideA = False
self.insideTD = False
self.insideLI = False
self.insideUL = False
self.insideSubUL = False
self.numOfTD = (muscles * 2) - 1
self.tables = 1
self.exercises = False
self.equipment = string.lower(equipment)
self.exerGroup = ""
self.prefix = ''
def start_td(self, attrs):
if self.tables == self.numOfTD:
self.insideTD = True
self.tables += 1
def start_li(self, attrs):
if self.insideTD == True:
self.insideLI = True
def start_ul(self, attrs):
if self.insideTD == True and self.insideLI == True:
if self.insideUL == True:
self.insideSubUL = True
self.insideUL = True
def start_a(self, attrs):
"""Process a hyperlink and its attributes"""
if self.insideTD == True:
for name, value in attrs:
if string.find(value,'WeightExercise') != -1 and \
string.find(value, '#') == -1:
self.exercises = True
else:
self.exercises = False
if name == 'href' and self.exercises == True and \
self.exerGroup == self.equipment:
self.urls.append(value)
self.insideA = True
def end_a(self):
"""Record the and of hyperlink"""
self.insideA = False
def end_ul(self):
if self.insideSubUL == True:
self.insideSubUL = False
else:
self.insideUL = False
def end_td(self):
self.insideTD = False
def handle_data(self, data):
"""Handle the textual 'data'"""
if self.insideA == True and self.exerGroup == self.equipment:
# print unicode(data)
# # replace some weird chars
# p = re.compile('\x9c')
# data = p.sub('-degree', data)
tmpData = string.join(string.split(data), ' ')
if self.insideUL == True and self.insideSubUL == False:
self.prefix = tmpData
elif self.insideSubUL == True:
tmpData = self.prefix + ' ' + tmpData
self.desc.append(tmpData)
if self.insideLI == True and self.insideA != True and \
self.insideTD == True and self.insideUL == False:
tmpData = string.split(data, " ")
if tmpData[0] != '\r\n':
self.exerGroup = string.lower(string.strip(tmpData[0]))
def GetExercisesList(self):
self.exerList = []
for i in xrange(0, len(self.desc)):
self.exerList.append([self.desc[i],self.urls[i]])
return self.exerList
def _firstGroup(pattern, text, flags):
    """Return group 1 of the first match of pattern in text, '' if none"""
    found = re.search(pattern, text, flags)
    if found is None:
        return ''
    return found.group(1)


def LoadExerciseInfo(url):
    """
    Download the exercise page at ``url`` and extract its parts.

    Returns a dictionary with the keys
      'title'          -- text of the <TITLE> element
      'classification' -- list of all matches of the classification pattern
      'image'          -- image url reported by the ImageURL parser
      'instructions'   -- HTML between the 'Instructions' and 'Comments'
                          headings
      'comments'       -- HTML from the 'Comments' heading up to the next
                          </TD>
      'muscles'        -- HTML from the 'Muscles' heading up to the last
                          </TD> of the page
    A part that is not present on the page is returned as '' (or [] for
    the classification) instead of raising AttributeError.

    Errors raised by urllib.urlopen() or while reading are passed on to
    the caller; the connection is closed in either case.
    """
    usock = urllib.urlopen(url)
    try:
        htmlData = usock.read()
    finally:
        usock.close()
    parser = ImageURL()
    parser.feed(htmlData)
    parser.close()
    # Extract exercise title
    exerciseTitle = _firstGroup('<TITLE>(.+)</TITLE>', htmlData,
                                re.M | re.I)
    # Extract exercise's classification
    p = re.compile(r'\d">(.+)</A></TD></TR>', re.M | re.I)
    exerciseClass = p.findall(htmlData)
    # Extract image url
    exerciseImageURL = parser.GetImageURL()
    # Extract instruction
    instructionHTML = _firstGroup(
        '<H2>Instructions</H2>(.+)<H2>Comments</H2>', htmlData,
        re.S | re.M | re.I).strip()
    # Extract comments
    commentsHTML = _firstGroup('<H2>Comments</H2>(.+?)</TD>', htmlData,
                               re.I | re.S | re.M).strip()
    # Extract muscles info
    musclesHTML = _firstGroup('<H2>Muscles</H2>(.+)</TD>', htmlData,
                              re.S | re.M | re.I).strip()
    return {'title': exerciseTitle,
            'classification': exerciseClass,
            'image': exerciseImageURL,
            'instructions': instructionHTML,
            'comments': commentsHTML,
            'muscles': musclesHTML}
class ImageURL(SGMLParser):
"""Class for extracting url for animated image"""
def __init__(self,verbose=0):
SGMLParser.__init__(self, verbose)
self.url = ''
def start_img(self, attrs):
for name, value in attrs:
if name == 'src':
self.url = value
def GetImageURL(self):
return self.url
if __name__ == '__main__':
    # Sample run: muscle index 2 on the calves page (urlList[9]) with
    # 'Lever' equipment (index 4); print one [description, url] per line.
    exercisesList = LoadExercisesList(url=9, equipment=4, muscles=2)
    for exercise in exercisesList:
        print(exercise)
_______________________________________________
Tutor maillist - Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor