[Tutor] SGMLLib, fetching some weird data

Basil Shubin Sat, 12 Aug 2006 04:47:35 -0700

Hi friends,

Please examine the attached script. I want to fetch some data from an online resource and have almost achieved this, but I can't fetch some oddly formatted data such as '45° Reverse Calf Press'. I get the following error:


45
 Reverse Calf Press
Reverse Calf Raise
Seated Reverse Calf Press
Traceback (most recent call last):
  File "net_exrx.py", line 226, in ?
    exercisesList = LoadExercisesList(2, 4, 9)
  File "net_exrx.py", line 86, in LoadExercisesList
    return parser.GetExercisesList()
  File "net_exrx.py", line 176, in GetExercisesList
    self.exerList.append([self.desc[i],self.urls[i]])
IndexError: list index out of range

Thanks in advance!

# -*- coding: utf-8 -*-

import urllib
import string
import re
from sgmllib import SGMLParser

# Exercise list pages on exrx.net, one per muscle group.  The order matters:
# LoadExercisesList() selects a page by its index in urlList.
_EXRX_LIST_PAGE = 'http://www.exrx.net/Lists/ExList/%sWt.html'

urlList = [
    _EXRX_LIST_PAGE % 'Neck',
    _EXRX_LIST_PAGE % 'Should',
    _EXRX_LIST_PAGE % 'Arm',
    _EXRX_LIST_PAGE % 'ForeArm',
    _EXRX_LIST_PAGE % 'Back',
    _EXRX_LIST_PAGE % 'Chest',
    _EXRX_LIST_PAGE % 'Waist',
    _EXRX_LIST_PAGE % 'Hips',
    _EXRX_LIST_PAGE % 'Thigh',
    _EXRX_LIST_PAGE % 'Calf',
]

# Per muscle group tables of 0/1 flags.
#
# Every row holds eight flags, which matches the eight entries of
# equipmentList in LoadExercisesList ('Assisted', 'Barbell', 'Cable',
# 'Dumbbell', 'Lever', 'Sled', 'Smith', 'Weighted').
# NOTE(review): presumably each row is one muscle of the group and a 1 means
# "exercises exist for this muscle with this equipment" -- confirm against
# the exrx.net pages.  Nothing in the visible part of this script reads
# these tables.
neck = [[0, 0, 1, 0, 1, 0, 0, 1],
        [0, 0, 1, 0, 1, 0, 0, 1]]

shoulders = [[0, 1, 1, 1, 1, 0, 1, 0],
             [0, 1, 1, 1, 1, 0, 1, 0],
             [0, 1, 1, 1, 1, 0, 1, 0],
             [0, 0, 1, 1, 0, 0, 0, 0]]

upper_arms = [[1, 1, 1, 1, 1, 0, 1, 1],
              [0, 1, 1, 1, 1, 0, 0, 0], 
              [0, 1, 1, 1, 1, 0, 0, 0]]

forearms = [[0, 1, 1, 1, 1, 0, 0, 0],
            [0, 1, 1, 1, 1, 0, 0, 0], 
            [0, 1, 1, 1, 1, 0, 0, 0],  
            [0, 0, 0, 1, 1, 0, 0, 0], 
            [0, 0, 0, 1, 1, 0, 0, 0]]

back = [[0, 1, 1, 1, 1, 0, 1, 1],
        [1, 1, 1, 0, 1, 0, 0, 1], 
        [0, 1, 1, 1, 1, 1, 1, 0],  
        [0, 0, 1, 1, 1, 0, 0, 0], 
        [0, 0, 1, 1, 1, 0, 0, 0]]

chest = [[1, 1, 1, 1, 1, 0, 1, 1],
         [0, 1, 1, 1, 1, 0, 1, 0],  
         [1, 0, 1, 0, 1, 0, 0, 1], 
         [0, 1, 1, 1, 1, 0, 1, 0]]


waist = [[0, 0, 1, 0, 1, 0, 0, 1],
         [0, 0, 1, 1, 1, 0, 0, 1], 
         [0, 1, 0, 1, 1, 0, 1, 1]]

hips = [[0, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 0, 1, 0, 0, 0], 
        [0, 0, 1, 0, 1, 0, 0, 1]]

thighs = [[0, 1, 1, 1, 1, 1, 1, 1],
          [0, 1, 1, 1, 1, 0, 1, 1]]

calves = [[0, 1, 1, 1, 1, 1, 1, 1],
          [0, 1, 0, 0, 1, 0, 1, 0], 
          [0, 1, 1, 1, 1, 1, 1, 0]]

# All tables in one list; the groups appear in the same order as the pages
# in urlList (neck, shoulders, arms, forearms, back, chest, waist, hips,
# thighs, calves).
exrxList = [neck, shoulders, upper_arms, forearms, back,
            chest, waist, hips, thighs, calves]


def LoadExercisesList(muscles, equipment, url):
    """
    Download one exercise list page and return the exercises found on it
    for the given muscle and equipment.

    muscles   -- number selecting the muscle on the page; it is incremented
                 by one here and then used by ExercisesListURL to pick a
                 <td> (presumably a zero-based muscle index -- confirm)
    equipment -- index into equipmentList below (0 = 'Assisted' ...
                 7 = 'Weighted'); IndexError if out of range
    url       -- index into the module-level urlList

    Returns the list built by ExercisesListURL.GetExercisesList(), i.e. a
    list of [description, url] pairs, or [] when the page cannot be opened.
    """
    equipmentList = ['Assisted', 'Barbell', 'Cable', 'Dumbbell', 'Lever',
                      'Sled', 'Smith', 'Weighted']
    try:
        usock = urllib.urlopen(urlList[url])
    except IOError:
        return []
    # Read everything first and always close the connection, so a failure
    # while reading or parsing cannot leave the socket open.
    try:
        htmlData = usock.read()
    finally:
        usock.close()
    # set up the muscles variable
    muscles += 1
    # workarounds for exercises on some muscles group
    # for 'Back' muscles (urlList[4]) the later muscles sit three further on
    if url == 4 and muscles > 3:
        muscles += 3
    parser = ExercisesListURL(muscles=muscles,
                              equipment=equipmentList[equipment])
    parser.feed(htmlData)
    parser.close()
    return parser.GetExercisesList()
        

class ExercisesListURL(SGMLParser):
    """
    Class for collecting descriptions and urls for exercise for
    selected muscles group
    """

    def __init__(self, verbose=0, muscles=1, equipment='assisted'):
        SGMLParser.__init__(self, verbose)

        self.urls = []
        self.desc = []
        self.insideA = False
        self.insideTD = False
        self.insideLI = False
        self.insideUL = False
        self.insideSubUL = False
        self.numOfTD = (muscles * 2) - 1
        self.tables = 1
        self.exercises = False
        self.equipment = string.lower(equipment)
        self.exerGroup = ""
        self.prefix = ''

    def start_td(self, attrs):
        if self.tables == self.numOfTD:
            self.insideTD = True
        self.tables += 1

    def start_li(self, attrs):
        if self.insideTD == True:
            self.insideLI = True

    def start_ul(self, attrs):
        if self.insideTD == True and self.insideLI == True:
            if self.insideUL == True:
                self.insideSubUL = True
            self.insideUL = True
        
    def start_a(self, attrs):
        """Process a hyperlink and its attributes"""
        if self.insideTD == True:
            for name, value in attrs:
                if string.find(value,'WeightExercise') != -1 and \
                       string.find(value, '#') == -1:
                    self.exercises = True
                else:
                    self.exercises = False
                if name == 'href' and self.exercises == True and \
                       self.exerGroup == self.equipment:
                    self.urls.append(value)
                    self.insideA = True

    def end_a(self):
        """Record the and of hyperlink"""
        self.insideA = False

    def end_ul(self):
        if self.insideSubUL == True:
            self.insideSubUL = False
        else:
            self.insideUL = False

    def end_td(self):
        self.insideTD = False

    def handle_data(self, data):
        """Handle the textual 'data'"""
        if self.insideA == True and self.exerGroup == self.equipment:
#             print unicode(data)
#             # replace some weird chars
#             p = re.compile('\x9c')
#             data = p.sub('-degree', data)
            tmpData = string.join(string.split(data), ' ')
            if self.insideUL == True and self.insideSubUL == False:
                self.prefix = tmpData
            elif self.insideSubUL == True:
                tmpData = self.prefix + ' ' + tmpData
            self.desc.append(tmpData)
        if self.insideLI == True and self.insideA != True and \
               self.insideTD == True and self.insideUL == False:
            tmpData = string.split(data, " ")
            if tmpData[0] != '\r\n':
                self.exerGroup = string.lower(string.strip(tmpData[0]))

    def GetExercisesList(self):
        self.exerList = []
        for i in xrange(0, len(self.desc)):
            self.exerList.append([self.desc[i],self.urls[i]])
        return self.exerList


def _SearchGroup(pattern, text, flags):
    """Return group 1 of the first match of pattern in text, '' if none"""
    found = re.search(pattern, text, flags)
    if found is None:
        return ''
    return found.group(1)


def LoadExerciseInfo(url):
    """
    Download one exercise page and extract its parts.

    url -- address of the exercise page

    Returns a dictionary with these keys:
      'title'          -- content of the <TITLE> element
      'classification' -- list of strings found by the classification
                          pattern (empty list if nothing matches)
      'image'          -- src of the last <img> tag on the page
      'instructions'   -- html between the 'Instructions' and 'Comments'
                          headings, stripped
      'comments'       -- html after the 'Comments' heading up to the next
                          </TD>, stripped
      'muscles'        -- html after the 'Muscles' heading up to the last
                          </TD>, stripped
    A part that is not found on the page is returned as '' instead of
    raising AttributeError.  IOError from opening or reading the url is
    passed on to the caller.
    """
    usock = urllib.urlopen(url)
    # always release the connection, even if reading fails
    try:
        htmlData = usock.read()
    finally:
        usock.close()
    parser = ImageURL()
    parser.feed(htmlData)
    parser.close()
    # Extract exercise title
    exerciseTitle = _SearchGroup(r'<TITLE>(.+)</TITLE>', htmlData,
                                 re.M | re.I)
    # Extract exercise's classification
    p = re.compile(r'\d">(.+)</A></TD></TR>', re.M | re.I)
    exerciseClass = p.findall(htmlData)
    # Extract image url
    exerciseImageURL = parser.GetImageURL()
    # Extract instruction
    instructionHTML = _SearchGroup(
        r'<H2>Instructions</H2>(.+)<H2>Comments</H2>', htmlData,
        re.S | re.M | re.I).strip()
    # Extract comments (non-greedy: stop at the first </TD>)
    commentsHTML = _SearchGroup(r'<H2>Comments</H2>(.+?)</TD>', htmlData,
                                re.I | re.S | re.M).strip()
    # Extract muscles info (greedy: runs to the last </TD>)
    musclesHTML = _SearchGroup(r'<H2>Muscles</H2>(.+)</TD>', htmlData,
                               re.S | re.M | re.I).strip()
    return {'title': exerciseTitle,
            'classification': exerciseClass,
            'image': exerciseImageURL,
            'instructions': instructionHTML,
            'comments': commentsHTML,
            'muscles': musclesHTML}

  
class ImageURL(SGMLParser):
    """Parser that remembers the src of the last <img> tag it has seen"""

    def __init__(self, verbose=0):
        SGMLParser.__init__(self, verbose)

        # stays '' until an <img> with a src attribute turns up
        self.url = ''

    def start_img(self, attrs):
        """Keep the src attribute of this image, if it has one"""
        sources = [value for name, value in attrs if name == 'src']
        if sources:
            # with a repeated src attribute the last one counts
            self.url = sources[-1]

    def GetImageURL(self):
        """Return the remembered image url ('' if no image was found)"""
        return self.url

if __name__ == '__main__':
    # Demo run: muscle number 2 on the calves page (urlList[9]) with the
    # equipment at index 4 of equipmentList ('Lever'); prints one
    # [description, url] pair per line.
    exercisesList = LoadExercisesList(muscles=2, equipment=4, url=9)
    for exercise in exercisesList:
        print exercise

_______________________________________________
Tutor maillist  -  Tutor@python.org
http://mail.python.org/mailman/listinfo/tutor

[Tutor] SGMLLib, fetching some weird data

Reply via email to