#parse.py
#Lyric parsing module (Leoslyrics, Astraweb, Lyrc)
#Code from Rhythmbox and Amarok sources
#Released under GPL
#Sirio Bolanos Puchet. 2007

import urllib
import urllib2
import re
import rb
from xml.dom import minidom

class Parser (object):
    """Front end that asks each lyric provider in turn for a song's lyrics."""

    def __init__(self,artist,title):
        """Remember the artist and title that will be looked up."""
        self.artist = artist
        self.title = title

    def return_lyric(self):
        """Return the first lyric text that any provider yields.

        Providers are consulted in the order Leoslyrics, Lyrc, Astraweb and
        the first truthy result wins. When every provider comes back empty
        a fixed "not found" message is returned instead.
        """
        # All providers are built up front; they are only queried as needed.
        providers = [
            Leoslyrics_parser(self.artist, self.title),
            Lyrc_parser(self.artist, self.title),
            Astraweb_parser(self.artist, self.title),
        ]

        for provider in providers:
            lyric = provider.parse_results()
            if lyric:
                return lyric

        return "No suitable lyric found."
            
class Lyrc_parser (object):
    """Fetch a song's lyrics from lyrc.com.ar and reduce the page to text."""

    def __init__(self,artist,title):
        """Remember the artist and title that will be looked up."""
        self.artist = artist
        self.title = title

    def parse_lyrics(self,lyrics):
        """Turn the lyric portion of a lyrc.com.ar page into plain text.

        lyrics -- the page HTML that follows the <font size='2'> separator,
                  with line breaks already stripped (see parse_results).

        Returns "<artist> - <title>", a blank line, the lyric lines and a
        credit line. Returns False when the fragment does not contain the
        <b>title / <u>artist header this parser relies on.
        """
        # The lyric body ends at the first "<p><hr" rule when there is one,
        # otherwise at the first empty line ("<br><br>").
        if re.search('<p><hr',lyrics):
            lyrics = re.split('<p><hr',lyrics,1)[0]
        else:
            lyrics = re.split('<br><br>',lyrics,1)[0]

        lyrics = re.sub('<[fF][oO][nN][tT][^>]*>','',lyrics)

        # Title is the text after the first <b>, artist the text after the
        # first <u>. Without both there is nothing sensible to return.
        title_match = re.search('<[bB]>([^<]*)',lyrics)
        artist_match = re.search('<[uU]>([^<]*)',lyrics)
        if title_match is None or artist_match is None:
            return False
        title = title_match.group(1)
        artist = artist_match.group(1)

        # Drop the header (everything from <b> up to the last </table>),
        # then turn the remaining <br> tags into real line breaks.
        lyrics = re.sub(r'<[bB]>[^<].*<\/[tT][aA][bB][lL][eE]>','',lyrics)
        lyrics = re.sub('<[Bb][Rr][^>]*>','\n',lyrics)

        heading = "%s - %s\n\n" % (artist,title)
        return heading + lyrics + "\n\nLyrics provided by lyrc.com.ar"

    def parse_results(self):
        """Look the song up on lyrc.com.ar.

        Returns the formatted lyric text, or False when the server cannot
        be reached or the response contains no lyric section.
        """
        path = 'http://www.lyrc.com.ar/en/'

        wartist = urllib.quote(self.artist)
        wtitle = urllib.quote(self.title)
        wurl = 'tema1en.php?artist=%s&songname=%s' % (wartist,wtitle)

        try:
            url_handle = urllib2.urlopen(path+wurl)
            try:
                lyrics = url_handle.read()
            finally:
                url_handle.close()
        except Exception:
            # A network failure only means this provider has nothing to
            # offer; report "no lyric" rather than stopping the program.
            return False

        # Strip line breaks and the markup that would confuse the parsing
        # below: images, links, scripts and style sheets.
        EXPS = [
            '\n',
            '\r',
            '<[iI][mM][gG][^>]*>',
            r'<[aA][^>]*>[^<]*<\/[aA]>',
            r'<[sS][cC][rR][iI][pP][tT][^>]*>[^<]*(<!--[^>]*>)*[^<]*<\/[sS][cC][rR][iI][pP][tT]>',
            r'<[sS][tT][yY][lL][eE][^>]*>[^<]*(<!--[^>]*>)*[^<]*<\/[sS][tT][yY][lL][eE]>',
        ]

        for exp in EXPS:
            lyrics = re.sub(exp,'',lyrics)

        # The lyric section starts right after the first <font size='2'>.
        separator = re.compile("<[fF][oO][nN][tT][ ]*[sS][iI][zZ][eE][ ]*='2'[ ]*>")
        found = separator.search(lyrics)
        if found is None:
            return False

        return self.parse_lyrics(lyrics[found.end():])


class Astraweb_parser (object):
    """Search lyrics.astraweb.com for a song and extract its lyrics."""

    def __init__(self,artist,title):
        """Remember the artist and title that will be looked up."""
        self.artist = artist
        self.title = title

    def parse_results(self):
        """Search the site and return the lyrics of the first fitting hit.

        A hit fits when the wanted title and artist each occur, ignoring
        case and surrounding whitespace, inside the hit's title and artist.

        Returns the formatted lyric text, or False when the search fails,
        lists nothing, or none of the listed songs fits.
        """
        wartist = re.sub('%20','+',urllib.quote(self.artist))
        wtitle = re.sub('%20','+',urllib.quote(self.title))

        wurl = 'http://search.lyrics.astraweb.com/?word=%s+%s' % (wartist,wtitle)

        try:
            url_handle = urllib2.urlopen(wurl)
            try:
                page = url_handle.read()
            finally:
                url_handle.close()
        except Exception:
            return False

        results = page.replace('\r','').replace('\n','')

        # The result rows sit between the first grey table cell and the
        # "More Songs >" link.
        listing = re.search('(<tr><td bgcolor="#BBBBBB".*)(More Songs &gt)',results)
        if listing is None:
            return False

        wanted_title = self.title.lower().strip()
        wanted_artist = self.artist.lower().strip()

        entries = re.split('<tr><td bgcolor="#BBBBBB"',listing.group(1))
        # The text before the first row marker is not a result row.
        for entry in entries[1:]:
            url_match = re.search(r'(\/display[^"]*)',entry)
            title_match = re.search(r'(\/display[^>]*)([^<]*)',entry)
            artist_match = re.search(r'(Artist:.*html">)([^<]*)',entry)
            if url_match is None or title_match is None or artist_match is None:
                # Row does not have the expected layout; try the next one.
                continue

            # Group 2 still starts with the '>' that closes the link tag.
            found_title = title_match.group(2)[1:].lower().strip()
            found_artist = artist_match.group(2).lower().strip()

            # Plain substring tests: the wanted strings are ordinary text
            # and must not be interpreted as regular expressions.
            if wanted_title in found_title and wanted_artist in found_artist:
                return self.parse_lyrics(url_match.group(1))

        return False

    def parse_lyrics(self,url):
        """Download a lyric page and return it as plain text.

        url -- path of the page relative to display.lyrics.astraweb.com,
               as found in the search results.

        Returns a heading taken from the page title, a blank line, the
        lyric lines and a credit line; or False when the page cannot be
        fetched or lacks the expected title / lyric markup.
        """
        path = 'http://display.lyrics.astraweb.com'

        try:
            url_handle = urllib2.urlopen(path+url)
            try:
                page = url_handle.read()
            finally:
                url_handle.close()
        except Exception:
            return False

        result = page.replace('\r','').replace('\n','')

        heading_match = re.search('<title>Lyrics: ([^<]*)',result)
        body_match = re.search(r'(<font face=arial size=2>)(.*)(<\/font><br></td><td*)',result)
        if heading_match is None or body_match is None:
            return False

        # The page title is split on " - " into artist and song title.
        artist_title = heading_match.group(1)
        parts = artist_title.split(" - ")
        if len(parts) >= 2:
            heading = "%s - %s\n\n" % (parts[0],parts[1])
        else:
            # No separator in the page title: use it unchanged.
            heading = artist_title + "\n\n"

        lyrics = heading + body_match.group(2)
        lyrics = re.sub('<[Bb][Rr][^>]*>','\n',lyrics)
        lyrics += "\n\nLyrics provided by lyrics.astraweb.com"

        return lyrics

class Leoslyrics_parser(object):
    """Look a song up through the leoslyrics.com XML API."""

    def __init__(self,artist,title):
        """Remember the artist and title that will be looked up."""
        self.artist = artist
        self.title = title

    def parse_results(self):
        """Search the API and return the lyrics of the first fitting result.

        Only the first ten results are examined. A result fits when the
        wanted title and artist each occur, ignoring case and surrounding
        whitespace, inside the result's title and artist name.

        Returns the formatted lyric text, or False when a request fails,
        the response is not usable, or no result fits.
        """
        artist = urllib.quote(self.artist)
        title = urllib.quote(self.title)

        htstring = 'http://api.leoslyrics.com/api_search.php?auth=Rhythmbox&artist=%s&songtitle=%s' % (artist,title)

        try:
            url_handle = urllib2.urlopen(htstring)
            try:
                data = url_handle.read()
            finally:
                url_handle.close()
        except Exception:
            return False

        try:
            xmldoc = minidom.parseString(data).documentElement
        except Exception:
            return False

        wanted_title = self.title.lower().strip()
        wanted_artist = self.artist.lower().strip()
        hid = None

        try:
            # A response code other than '0' is treated as "nothing found".
            responses = xmldoc.getElementsByTagName('response')
            if not responses or responses[0].getAttribute('code') != '0':
                return False

            for match in xmldoc.getElementsByTagName('result')[:10]:
                titles = match.getElementsByTagName('title')
                names = match.getElementsByTagName('name')
                if not titles or not names:
                    continue
                title_node = titles[0].firstChild
                name_node = names[0].firstChild
                if title_node is None or name_node is None:
                    # Empty <title/> or <name/>: nothing to compare against.
                    continue

                # Plain substring tests: the wanted strings are ordinary
                # text and must not be interpreted as regular expressions.
                if (wanted_title in title_node.data.lower().strip() and
                        wanted_artist in name_node.data.lower().strip()):
                    hid = match.getAttribute('hid')
                    break
        finally:
            # Release the DOM tree on every path out of this section.
            xmldoc.unlink()

        if not hid:
            return False

        try:
            lurl = "http://api.leoslyrics.com/api_lyrics.php?auth=Rhythmbox&hid=%s" % (urllib.quote(hid.encode('utf-8')))
            url_handle = urllib2.urlopen(lurl)
            try:
                reslt = url_handle.read()
            finally:
                url_handle.close()
        except Exception:
            return False

        return self.parse_lyrics(reslt)


    def parse_lyrics(self, data):
        """Format the XML document returned by the lyrics API call.

        data -- the XML text of the response, or None.

        Returns "<title> - <artist>", a blank line, the lyric text and a
        credit line; or False when data is None, is not well-formed XML,
        or lacks the title, artist name or text elements.
        """
        if data is None:
            return False

        try:
            xmldoc = minidom.parseString(data).documentElement
        except Exception:
            return False

        try:
            try:
                text = xmldoc.getElementsByTagName('title')[0].firstChild.nodeValue
                text += ' - ' + xmldoc.getElementsByTagName('artist')[0].getElementsByTagName('name')[0].firstChild.nodeValue + '\n\n'
                text += xmldoc.getElementsByTagName('text')[0].firstChild.nodeValue
            except (IndexError, AttributeError, TypeError):
                # A missing element, an empty element, or a first child
                # that carries no text: the document holds no usable lyric.
                return False
        finally:
            xmldoc.unlink()

        text += "\n\nLyrics provided by leoslyrics.com"

        return text
