Yep that works, I promise next time I'll think before I post :) Thank you! see new script attached.
On Friday 06 August 2010 14:54:41 Jie Jiang wrote: > add a \ before that quote > > Best regards! > > Jie Jiang > CNGL, School of Computing, > Dublin City University, > Glasnevin, Dublin 9. > Tel: +353 (0)1 700 6724 > > > > > 2010/8/6 Sylvain Raybaud <sylvain.rayb...@loria.fr> > > > On Friday 06 August 2010 14:06:55 Sylvain Raybaud wrote: > > > * moses cannot translate "real world" graph generated with this tool. > > > > > > See for example: http://perso.crans.org/raybaud/realworld.slf.gz > > > > got it: sphinx hypothesis contains a quote: > > > > nous incombe d' assurer > > > > which ends up like this in plf: > > > > ('d'',x,y) > > > > see example7 attached (with quote) and example7.cleaned (no quote): the > > former > > gives me the same error, the later is translated just fine. > > > > > > how are we supposed to deal with quotes in PLF? > > > > regards, > > > > -- > > Sylvain > > > > _______________________________________________ > > Moses-support mailing list > > Moses-support@mit.edu > > http://mailman.mit.edu/mailman/listinfo/moses-support -- Sylvain
#!/usr/bin/env python # -*- coding: utf-8 -*- # # # Copyright (C) 2010, Sylvain Raybaud <srayb...@crans.org> # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Sylvain Raybaud nor the names of its contributors may # be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY Sylvain Raybaud <srayb...@crans.org> ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL Sylvain Raybaud BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # This piece of software turns a word lattice in SLF (HTK format) and converts it # to PLF (Moses input format) # SLF is documented there: http://labrosa.ee.columbia.edu/doc/HTKBook21/node257.html # PLF is documented there: http://www.statmt.org/moses/?n=Moses.WordLattices # I tried to keep most of the flexibility of SLF # # Usage: slf2plf [options] input_file > output # or: slf2plf [options] input_file.gz > output # or: slf2plf [options] < input_file > output # Options: # -C, --dont-clean: don't remove noise information from input lattice # -R, --dont-replace: don't use default substitution list. Be careful (see help) # -h, --help: print short help # # Principle: # 1) load graph into memory as a list of nodes and a list of links # each node has access to the list of its incoming and outgoing # links # 2) Compute a topological ordering of nodes # 3) Output lattice in moses input format # # The algorithm implemented for topological sort is the one described here on the 6 august 2010: # http://en.wikipedia.org/wiki/Topological_sorting # # TODO: # a) Make the program robust to missing accoustic and lm scores in the lattice # b) Make it possible to choose how output lattice score is computed from nodes and link attributes # c) Deal with sublattices # d) Make the program robust to incorrect input USAGE = """ %prog [options] input_file > output or: %prog [options] input_file.gz > output or: %prog [options] < input_file > output input_file is a graph in SLF format (HTK), possibly gziped (.gz extension required).""" from optparse import OptionParser import gzip import sys import re import copy #general purpose tools to make your life easier def open_plain_or_gz(filename,mode='r'): """Transparently open gziped or plain files, based on file extension""" return gzip.open(filename,mode) if filename.endswith(".gz") else open(filename,mode) def verbose_msg(opts,msg): if opts.verbose: sys.stderr.write(msg+'\n') def warn(msg): """Print a warning message to stderr. The program continues.""" sys.stderr.write("*** WARNING *** %s\n" % msg) def error(msg,code=1): """Print an error message to stderr and exists with the given code (default: 1).""" sys.stderr.write("*** ERROR *** %s\n" % msg) exit(code) # regular expressions that will be used later __attr_re__ = r"(?P<attribute>[^=]+)=(?P<quote>(\"|'')?)(?P<value>.*)(?P=quote)" # filed_name=field_value value may be "quoted" __size_re__ = r"(?P<attribute>N|NODES|L|LINKS)=(?P<value>\d+)" # for parsing size information in SLF __node_re__ = r"I=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__ # for parsing node definition in SLF __link_re__ = r"J=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__ # for parsing link information in SLF __noise_re__ = r"\+\+[^\+]*\+\+" # links to be cleaned out if requested CLEAN = ['<sil>'] # word replacement list REPLACE = {"d'": r"d\'"} def __replace_word__(string): if REPLACE.has_key(string): return REPLACE[string] else: return string # a few mandatory attributes in short and long forms. They will be stored in long form ATTRIBUTES = { 'E': 'END', 'S': 'START', 'W': 'WORD', 'a': 'acoustic', 'l': 'language' } def __get_long_form__(string): if ATTRIBUTES.has_key(string): return ATTRIBUTES[string] else: return string # remove comments and blank lines from input def __clean__(lines): data = [] for l in lines: if l.strip()!='' and l[0]!='#': data.append(re.sub("#.*\n",'',l)) return data # print a link out of the first node in the list 'remains' def __print_links_out__(lattice,remains): n = lattice.nodes[remains[0]] sys.stdout.write("(") for l in n.outgoing: link = lattice.links[l] distance = remains.index(link.to_node) # this is the distance between current node and the node it is linked to a = float(link.attributes['acoustic']) l = float(link.attributes['language']) sys.stdout.write("('%s',%f,%i)," % (link.attributes['WORD'],a+l*lattice.lmscale,distance)) sys.stdout.write("),") class Node: """ Node definition for using in word lattice """ def __init__(self,index,attributes_string=None): """index is just the id of the node. attribute_string is a succession fo definition of the form: name=value""" self.index = index self.incoming = set() self.outgoing = set() self.attributes = {} if attributes_string: attributes = attributes_string.split() for attr_str in attributes: m = re.match(__attr_re__,attr_str) attr = __get_long_form__(m.group('attribute')) val = m.group('value') if self.attributes.has_key(attr): warn("Node %i: attribute '%s' redefined." % (self.index,attr)) self.attributes[attr] = val class Link: """ Link definition for using in word lattice """ def __init__(self,index,attributes_string=None,replace=True): """index is just the id of the link. attribute_string is a succession fo definition of the form: name=value""" self.index = index self.from_node = -1 self.to_node = -1 self.word = "" self.attributes = {} if attributes_string: attributes = attributes_string.split() for attr_str in attributes: m = re.match(__attr_re__,attr_str) attr = __get_long_form__(m.group('attribute')) val = m.group('value') if self.attributes.has_key(attr): warn("Node %i: attribute '%s' redefined." % (self.index,attr)) if replace and attr=='WORD': val = __replace_word__(val) self.attributes[attr] = val try: self.from_node = int(self.attributes['START']) self.to_node = int(self.attributes['END']) except: error("Incomplete definition of link %i" % self.index) #TODO: check that word attribute is defined either here or on destination node class Lattice: """ Class representing a word lattice with constructor and method for loading a lattice in HTK Standard Lattice Format. Also has a method for writing the lattice in Moses input format (PLF) """ def __init__(self,lattfile=None,lines=None,clean=True,replace=True): # can read lattice either from file or data in memory """ lattfile should be an open stream. lines should contain a reversed list of already read lines. At least one of them must be provided. Option 'clean' controls weither or not noise information should be removes from the lattice. Option 'replace' controls weither or not some words will be replaced by other (for sanitizing input, e.g. quotes). """ data = [] if lattfile==None and (lines==None or len(lines)==0): error("Where shall I read the lattice from?") if lattfile!=None: data = lattfile.readlines() #not very memory efficient, but easier for parsing data.reverse() # lines to be parsed will be stored on a stack if lines!=None: data = data+lines data = __clean__(data) #remove comments and empty lines self.nodes_count = -1 self.links_count = -1 self.nodes = {} self.links = {} self.sublattices = {} self.lmscale = 1.0 self.attributes = {} self.__load_graph__(data,clean,replace) def __load_graph__(self,lines,clean,replace): self.__parse_header__(lines) self.__parse_counts__(lines) self.__parse_nodes__(lines) removed_links = self.__parse_links__(lines,clean,replace) removed_nodes = 0 if clean: removed_nodes = self.remove_orphan_nodes() if len(self.nodes)+removed_nodes!=self.nodes_count or len(self.links)+removed_links!=self.links_count: warn("Incorrect nodes or links count") if self.attributes.has_key('lmscale'): self.lmscale = float(self.attributes['lmscale']) def __parse_header__(self,lines): attributes_tmp = {} go = True # carry on as long as we are parsing a header while len(lines)>0 and go: l = lines.pop() m = re.match(__attr_re__,l) if m and not re.match(__size_re__+".*",l): # should match an attribute but *not* a size definition attr = m.group('attribute') val = m.group('value') if attributes_tmp.has_key(attr): warn("Attribute '%s' redefined." % attr) attributes_tmp[attr] = val if attr=="SUBLAT": # we are parsing a sublattice. go down, create it and then carry on. sublat = Lattice(lines=lines) # merge attributes for k in attributes_tmp: if sublat.attributes.has_key(k): warn("Sublattice '%s': attribute '%s' redefined." % (val,k)) sublat.attributes.update(attributes_tmp) attributes_tmp = {} self.sublattices[val] = sublat if lines.pop().strip()!=":": warn("Missing colon at the end of sublattice '%s'. Continuing anyway, but things could go wrong." % val) else: go = False if not go: lines.append(l) self.attributes = attributes_tmp # If I understand correctly counts may be written in any order, on 1 or 2 lines # Therefore parsed lines need to be split def __parse_counts__(self,lines): go = True while len(lines)>0 and go: l = lines.pop() terms = l.split() while len(terms)>0 and go: t = terms.pop() m = re.match(__size_re__,t) if m: if m.group('attribute')=='N' or m.group('attribute')=='NODES': if self.nodes_count!=-1: warn("Nodes count redefined.") self.nodes_count = int(m.group('value')) elif m.group('attribute')=='L' or m.group('attribute')=='LINKS': if self.links_count!=-1: warn("Links count redefined.") self.links_count = int(m.group('value')) else: go = False if not go: lines.append(l) def __parse_nodes__(self,lines): go = True while len(lines)>0 and go: l = lines.pop() m = re.match(__node_re__,l) if m: index = int(m.group('id')) n = Node(index,m.group('attributes')) if self.nodes.has_key(index): warn("Node %i redefined." % index) self.nodes[index] = n else: go = False if not go: lines.append(l) def __parse_links__(self,lines,clean,replace): removed_links = 0 go = True while len(lines)>0 and go: l = lines.pop() m = re.match(__link_re__,l) if m: index = int(m.group('id')) if self.links.has_key(index): warn("Link %i redefined." % index) e = Link(index,m.group('attributes'),replace) # if link is not noise, or if no cleaning required, insert link if (not clean) or (not 'WORD' in e.attributes) or ((not e.attributes['WORD'] in CLEAN) and (not re.match(__noise_re__,e.attributes['WORD']))): self.__update_nodes__(e) # Information about nodes is contained in link definition self.links[index] = e else: removed_links += 1 else: go = False if not go: lines.append(l) return removed_links def __update_nodes__(self,link): # in case the nodes were not explicitely defined if not self.nodes.has_key(link.from_node): self.nodes[link.from_node] = Node(link.from_node) if not self.nodes.has_key(link.to_node): self.nodes[link.to_node] = Node(link.to_node) # update connection information of nodes self.nodes[link.from_node].outgoing.add(link.index) self.nodes[link.to_node].incoming.add(link.index) def remove_orphan_nodes(self): """ Put orphaned nodes out of their misery """ removed_nodes = 0 for n in self.nodes.values(): if len(n.incoming)==0 and len(n.outgoing)==0: self.nodes.pop(n.index) removed_nodes += 1 return removed_nodes # write lattice in PLF def write_plf(self): """ Write lattice in moses input format (PLF) on standard output. """ # get topological ordering ordered_nodes = topological_ordering(self) # NOT THAT EASY # ordered nodes of a sublattice can be inserted at the position of the node they replace #for i,n in len(ordered_nodes): #if self.nodes[n].has_key('L'): # this node is in fact a sublattice #sublat_ordered = self.sublattices # write lattice sys.stdout.write("(") for i,n in enumerate(ordered_nodes[:-1]): # list all nodes in topological order __print_links_out__(self,ordered_nodes[i:]) # and write links leaving them sys.stdout.write(")\n") return # topological sorting algorithm needs to be initialised with a list of initial nodes having no incoming link def get_starting_nodes(lattice): """ Given an instance of class Lattice, output the list of all nodes with no incoming link """ S = [] for n in lattice.nodes.values(): if len(n.incoming)==0: S.append(n.index) return S def topological_ordering(original_lattice): """ Given an instance of class Lattice, output a topological ordering of nodes (as a list of indexes) """ lattice = copy.deepcopy(original_lattice) # the lattice will be modified by the program. Work on a copy. start_nodes = get_starting_nodes(lattice) ordered_nodes = [] while len(start_nodes)>0: n = start_nodes.pop() ordered_nodes.append(n) for l in set(lattice.nodes[n].outgoing): m = lattice.nodes[lattice.links[l].to_node] lattice.links.pop(l) m.incoming.remove(l) lattice.nodes[n].outgoing.remove(l) if len(m.incoming)==0: start_nodes.append(m.index) if len(lattice.links)>0: warn("Lattice has at least one cycle.") return ordered_nodes if __name__=='__main__': parser = OptionParser(usage=USAGE) parser.add_option('-C','--dont-clean',default=True,action="store_false",dest="clean",help="Don't clean input graph: word lattice graph generated by sphinx contain information about noise which should be removed for translation. Use this option if you want to keep it. [clean]") parser.add_option('-R','--dont-replace',default=True,action="store_false",dest="replace",help="Don't use default substitution list. Be careful! for example if there is a quote in your graph moses won't be able to decode it. [replace]") parser.add_option('-v','--verbose',default=False,action="store_true",dest="verbose",help="verbosity [%default]") opts,args = parser.parse_args() if len(args)>1: parser.print_help() error("bad command line.",1) input_file = sys.stdin if (len(args)==0 or args[0]=="-") else open_plain_or_gz(args[0]) lattice = Lattice(input_file,clean=opts.clean,replace=opts.replace) lattice.write_plf() if not (len(args)==0 or args[0]=="-"): input_file.close()
_______________________________________________ Moses-support mailing list Moses-support@mit.edu http://mailman.mit.edu/mailman/listinfo/moses-support