On Friday 06 August 2010 14:06:55 Sylvain Raybaud wrote: > * moses cannot translate "real world" graph generated with this tool. > See for example: http://perso.crans.org/raybaud/realworld.slf.gz
got it: sphinx hypothesis contains a quote: nous incombe d' assurer which ends up like this in plf: ('d'',x,y) see example7 attached (with quote) and example7.cleaned (no quote): the former gives me the same error, the later is translated just fine. how are we supposed to deal with quotes in PLF? regards, -- Sylvain
#!/usr/bin/env python # -*- coding: utf-8 -*- # # # Copyright (C) 2010, Sylvain Raybaud <srayb...@crans.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Sylvain Raybaud nor the names of its contributors may # be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY Sylvain Raybaud <srayb...@crans.org> ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL Sylvain Raybaud BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # This piece of software turns a word lattice in SLF (HTK format) and converts it # to PLF (Moses input format) # SLF is documented there: http://labrosa.ee.columbia.edu/doc/HTKBook21/node257.html # PLF is documented there: http://www.statmt.org/moses/?n=Moses.WordLattices # I tried to keep most of the flexibility of SLF # # Usage: slf2plf [options] input_file > output # or: slf2plf [options] input_file.gz > output # or: slf2plf [options] < input_file > output # Options: # -C, --dont-clean: don't remove noise information from input lattice # -h, --help: print short help # # Principle: # 1) load graph into memory as a list of nodes and a list of links # each node has access to the list of its incoming and outgoing # links # 2) Compute a topological ordering of nodes # 3) Output lattice in moses input format # # The algorithm implemented for topological sort is the one described here on the 6 august 2010: # http://en.wikipedia.org/wiki/Topological_sorting # # TODO: # a) Make the program robust to missing accoustic and lm scores in the lattice # b) Make it possible to choose how output lattice score is computed from nodes and link attributes # c) Deal with sublattices # d) Make the program robust to incorrect input USAGE = """ %prog [options] input_file > output or: %prog [options] input_file.gz > output or: %prog [options] < input_file > output input_file is a graph in SLF format (HTK), possibly gziped (.gz extension required).""" from optparse import OptionParser import gzip import sys import re import copy #general purpose tools to make your life easier def open_plain_or_gz(filename,mode='r'): """Transparently open gziped or plain files, based on file extension""" return gzip.open(filename,mode) if filename.endswith(".gz") else open(filename,mode) def verbose_msg(opts,msg): if opts.verbose: sys.stderr.write(msg+'\n') def warn(msg): """Print a warning message to stderr. The program continues.""" sys.stderr.write("*** WARNING *** %s\n" % msg) def error(msg,code=1): """Print an error message to stderr and exists with the given code (default: 1).""" sys.stderr.write("*** ERROR *** %s\n" % msg) exit(code) # regular expressions that will be used later __attr_re__ = r"(?P<attribute>[^=]+)=(?P<quote>(\"|'')?)(?P<value>.*)(?P=quote)" # filed_name=field_value value may be "quoted" __size_re__ = r"(?P<attribute>N|NODES|L|LINKS)=(?P<value>\d+)" # for parsing size information in SLF __node_re__ = r"I=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__ # for parsing node definition in SLF __link_re__ = r"J=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__ # for parsing link information in SLF __noise_re__ = r"\+\+[^\+]*\+\+" # a few mandatory attributes in short and long forms. They will be stored in long form ATTRIBUTES = { 'E': 'END', 'S': 'START', 'W': 'WORD', 'a': 'acoustic', 'l': 'language' } def __get_long_form__(string): if ATTRIBUTES.has_key(string): return ATTRIBUTES[string] else: return string # remove comments and blank lines from input def __clean__(lines): data = [] for l in lines: if l.strip()!='' and l[0]!='#': data.append(re.sub("#.*\n",'',l)) return data # print a link out of the first node in the list 'remains' def __print_links_out__(lattice,remains): n = lattice.nodes[remains[0]] sys.stdout.write("(") for l in n.outgoing: link = lattice.links[l] distance = remains.index(link.to_node) # this is the distance between current node and the node it is linked to a = float(link.attributes['acoustic']) l = float(link.attributes['language']) sys.stdout.write("('%s',%f,%i)," % (link.attributes['WORD'],a+l*lattice.lmscale,distance)) sys.stdout.write("),") class Node: """ Node definition for using in word lattice """ def __init__(self,index,attributes_string=None): """index is just the id of the node. attribute_string is a succession fo definition of the form: name=value""" self.index = index self.incoming = set() self.outgoing = set() self.attributes = {} if attributes_string: attributes = attributes_string.split() for attr_str in attributes: m = re.match(__attr_re__,attr_str) attr = __get_long_form__(m.group('attribute')) val = m.group('value') if self.attributes.has_key(attr): warn("Node %i: attribute '%s' redefined." % (self.index,attr)) self.attributes[attr] = val class Link: """ Link definition for using in word lattice """ def __init__(self,index,attributes_string=None): """index is just the id of the link. attribute_string is a succession fo definition of the form: name=value""" self.index = index self.from_node = -1 self.to_node = -1 self.word = "" self.attributes = {} if attributes_string: attributes = attributes_string.split() for attr_str in attributes: m = re.match(__attr_re__,attr_str) attr = __get_long_form__(m.group('attribute')) val = m.group('value') if self.attributes.has_key(attr): warn("Node %i: attribute '%s' redefined." % (self.index,attr)) self.attributes[attr] = val try: self.from_node = int(self.attributes['START']) self.to_node = int(self.attributes['END']) except: error("Incomplete definition of link %i" % self.index) #TODO: check that word attribute is defined either here or on destination node class Lattice: """ Class representing a word lattice with constructor and method for loading a lattice in HTK Standard Lattice Format. Also has a method for writing the lattice in Moses input format (PLF) """ def __init__(self,lattfile=None,lines=None,clean=True): # can read lattice either from file or data in memory """ lattfile should be an open stream. lines should contain a reversed list of already read lines. At least one of them must be provided. Option 'clean' control weither or not noise information should be removes from the lattice. """ data = [] if lattfile==None and (lines==None or len(lines)==0): error("Where shall I read the lattice from?") if lattfile!=None: data = lattfile.readlines() #not very memory efficient, but easier for parsing data.reverse() # lines to be parsed will be stored on a stack if lines!=None: data = data+lines data = __clean__(data) #remove comments and empty lines self.nodes_count = -1 self.links_count = -1 self.nodes = {} self.links = {} self.sublattices = {} self.lmscale = 1.0 self.attributes = {} self.__load_graph__(data,clean) def __load_graph__(self,lines,clean): self.__parse_header__(lines) self.__parse_counts__(lines) self.__parse_nodes__(lines) removed_links = self.__parse_links__(lines,clean) removed_nodes = 0 if clean: removed_nodes = self.remove_orphan_nodes() if len(self.nodes)+removed_nodes!=self.nodes_count or len(self.links)+removed_links!=self.links_count: warn("Incorrect nodes or links count") if self.attributes.has_key('lmscale'): self.lmscale = float(self.attributes['lmscale']) def __parse_header__(self,lines): attributes_tmp = {} go = True # carry on as long as we are parsing a header while len(lines)>0 and go: l = lines.pop() m = re.match(__attr_re__,l) if m and not re.match(__size_re__+".*",l): # should match an attribute but *not* a size definition attr = m.group('attribute') val = m.group('value') if attributes_tmp.has_key(attr): warn("Attribute '%s' redefined." % attr) attributes_tmp[attr] = val if attr=="SUBLAT": # we are parsing a sublattice. go down, create it and then carry on. sublat = Lattice(lines=lines) # merge attributes for k in attributes_tmp: if sublat.attributes.has_key(k): warn("Sublattice '%s': attribute '%s' redefined." % (val,k)) sublat.attributes.update(attributes_tmp) attributes_tmp = {} self.sublattices[val] = sublat if lines.pop().strip()!=":": warn("Missing colon at the end of sublattice '%s'. Continuing anyway, but things could go wrong." % val) else: go = False if not go: lines.append(l) self.attributes = attributes_tmp # If I understand correctly counts may be written in any order, on 1 or 2 lines # Therefore parsed lines need to be split def __parse_counts__(self,lines): go = True while len(lines)>0 and go: l = lines.pop() terms = l.split() while len(terms)>0 and go: t = terms.pop() m = re.match(__size_re__,t) if m: if m.group('attribute')=='N' or m.group('attribute')=='NODES': if self.nodes_count!=-1: warn("Nodes count redefined.") self.nodes_count = int(m.group('value')) elif m.group('attribute')=='L' or m.group('attribute')=='LINKS': if self.links_count!=-1: warn("Links count redefined.") self.links_count = int(m.group('value')) else: go = False if not go: lines.append(l) def __parse_nodes__(self,lines): go = True while len(lines)>0 and go: l = lines.pop() m = re.match(__node_re__,l) if m: index = int(m.group('id')) n = Node(index,m.group('attributes')) if self.nodes.has_key(index): warn("Node %i redefined." % index) self.nodes[index] = n else: go = False if not go: lines.append(l) def __parse_links__(self,lines,clean): removed_links = 0 go = True while len(lines)>0 and go: l = lines.pop() m = re.match(__link_re__,l) if m: index = int(m.group('id')) if self.links.has_key(index): warn("Link %i redefined." % index) e = Link(index,m.group('attributes')) # if link is not noise, or if no cleaning required, insert link if (not clean) or (not 'WORD' in e.attributes) or (not re.match(__noise_re__,e.attributes['WORD'])): self.__update_nodes__(e) # Information about nodes is contained in link definition self.links[index] = e else: removed_links += 1 else: go = False if not go: lines.append(l) return removed_links def __update_nodes__(self,link): # in case the nodes were not explicitely defined if not self.nodes.has_key(link.from_node): self.nodes[link.from_node] = Node(link.from_node) if not self.nodes.has_key(link.to_node): self.nodes[link.to_node] = Node(link.to_node) # update connection information of nodes self.nodes[link.from_node].outgoing.add(link.index) self.nodes[link.to_node].incoming.add(link.index) def remove_orphan_nodes(self): """ Put orphaned nodes out of their misery """ removed_nodes = 0 for n in self.nodes.values(): if len(n.incoming)==0 and len(n.outgoing)==0: self.nodes.pop(n.index) removed_nodes += 1 return removed_nodes # write lattice in PLF def write_plf(self): """ Write lattice in moses input format (PLF) on standard output. """ # get topological ordering ordered_nodes = topological_ordering(self) # NOT THAT EASY # ordered nodes of a sublattice can be inserted at the position of the node they replace #for i,n in len(ordered_nodes): #if self.nodes[n].has_key('L'): # this node is in fact a sublattice #sublat_ordered = self.sublattices # write lattice sys.stdout.write("(") for i,n in enumerate(ordered_nodes[:-1]): # list all nodes in topological order __print_links_out__(self,ordered_nodes[i:]) # and write links leaving them sys.stdout.write(")\n") return # topological sorting algorithm needs to be initialised with a list of initial nodes having no incoming link def get_starting_nodes(lattice): """ Given an instance of class Lattice, output the list of all nodes with no incoming link """ S = [] for n in lattice.nodes.values(): if len(n.incoming)==0: S.append(n.index) return S def topological_ordering(original_lattice): """ Given an instance of class Lattice, output a topological ordering of nodes (as a list of indexes) """ lattice = copy.deepcopy(original_lattice) # the lattice will be modified by the program. Work on a copy. start_nodes = get_starting_nodes(lattice) ordered_nodes = [] while len(start_nodes)>0: n = start_nodes.pop() ordered_nodes.append(n) for l in set(lattice.nodes[n].outgoing): m = lattice.nodes[lattice.links[l].to_node] lattice.links.pop(l) m.incoming.remove(l) lattice.nodes[n].outgoing.remove(l) if len(m.incoming)==0: start_nodes.append(m.index) if len(lattice.links)>0: warn("Lattice has at least one cycle.") return ordered_nodes if __name__=='__main__': parser = OptionParser(usage=USAGE) parser.add_option('-C','--dont-clean',default=True,action="store_false",dest="clean",help="Don't clean input graph: word lattice graph generated by sphinx contain information about noise which should be removed for translation. Use this option if you want to keep it. [clean]") parser.add_option('-v','--verbose',default=False,action="store_true",dest="verbose",help="verbosity [%default]") opts,args = parser.parse_args() if len(args)>1: parser.print_help() error("bad command line.",1) input_file = sys.stdin if (len(args)==0 or args[0]=="-") else open_plain_or_gz(args[0]) lattice = Lattice(input_file,clean=opts.clean) lattice.write_plf() if not (len(args)==0 or args[0]=="-"): input_file.close()
VERSION=1.1 UTTERANCE=1 lmscale=1 wdpenalty=0.0 # # Size line # N=5 L=5 # # Node definitions # I=0 I=1 I=2 I=3 I=4 # # Link definitions. # J=0 S=0 E=1 W=nous a=0 l=0 J=1 S=1 E=2 W=incombe a=0 l=0 J=2 S=2 E=3 W=d' a=0 l=0 J=3 S=3 E=4 W=assurer a=0 l=0 J=4 S=0 E=2 W=incombe a=0 l=0
((('nous',0.000000,1),('incombe',0.000000,2),),(('incombe',0.000000,1),),(('d'',0.000000,1),),(('assurer',0.000000,1),),)
((('nous',0.000000,1),('incombe',0.000000,2),),(('incombe',0.000000,1),),(('d',0.000000,1),),(('assurer',0.000000,1),),)
_______________________________________________ Moses-support mailing list Moses-support@mit.edu http://mailman.mit.edu/mailman/listinfo/moses-support