On Friday 06 August 2010 14:06:55 Sylvain Raybaud wrote:
>   * moses cannot translate "real world" graph generated with this tool.
> See for example: http://perso.crans.org/raybaud/realworld.slf.gz

got it: sphinx hypothesis contains a quote:

nous incombe d' assurer

which ends up like this in plf:

('d'',x,y)

see example7 attached (with quote) and example7.cleaned (no quote): the former 
gives me the same error, the later is translated just fine.


how are we supposed to deal with quotes in PLF?

regards,

-- 
Sylvain
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
# Copyright (C) 2010, Sylvain Raybaud <srayb...@crans.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of Sylvain Raybaud nor the names of its contributors may
#       be used to endorse or promote products derived from this software
#       without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Sylvain Raybaud <srayb...@crans.org> ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Sylvain Raybaud BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# This piece of software turns a word lattice in SLF (HTK format) and converts it
# to PLF (Moses input format)
# SLF is documented there: http://labrosa.ee.columbia.edu/doc/HTKBook21/node257.html
# PLF is documented there: http://www.statmt.org/moses/?n=Moses.WordLattices
# I tried to keep most of the flexibility of SLF
#
# Usage:  slf2plf [options] input_file > output
# or:     slf2plf [options] input_file.gz > output
# or:     slf2plf [options] < input_file > output
# Options:
#    -C, --dont-clean: don't remove noise information from input lattice
#    -h, --help: print short help
#
# Principle:
#  1) load graph into memory as a list of nodes and a list of links
#     each node has access to the list of its incoming and outgoing
#     links
#  2) Compute a topological ordering of nodes
#  3) Output lattice in moses input format
#
# The algorithm implemented for topological sort is the one described here on the 6 august 2010:
#   http://en.wikipedia.org/wiki/Topological_sorting 
#
# TODO:
#  a) Make the program robust to missing accoustic and lm scores in the lattice
#  b) Make it possible to choose how output lattice score is computed from nodes and link attributes
#  c) Deal with sublattices
#  d) Make the program robust to incorrect input


USAGE = """
    %prog [options] input_file > output
or: %prog [options] input_file.gz > output
or: %prog [options] < input_file > output
input_file is a graph in SLF format (HTK), possibly gziped (.gz extension required)."""

from optparse import OptionParser
import gzip
import sys
import re
import copy


#general purpose tools to make your life easier

def open_plain_or_gz(filename,mode='r'):
    """Transparently open gziped or plain files, based on file extension"""
    return gzip.open(filename,mode) if filename.endswith(".gz") else open(filename,mode)

def verbose_msg(opts,msg):
    if opts.verbose:
        sys.stderr.write(msg+'\n')

def warn(msg):
    """Print a warning message to stderr. The program continues."""
    sys.stderr.write("*** WARNING *** %s\n" % msg)
    
def error(msg,code=1):
    """Print an error message to stderr and exists with the given code (default: 1)."""
    sys.stderr.write("*** ERROR *** %s\n" % msg)
    exit(code)


# regular expressions that will be used later

__attr_re__ = r"(?P<attribute>[^=]+)=(?P<quote>(\"|'')?)(?P<value>.*)(?P=quote)"  # filed_name=field_value    value may be "quoted"
__size_re__ = r"(?P<attribute>N|NODES|L|LINKS)=(?P<value>\d+)"  # for parsing size information in SLF
__node_re__ = r"I=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__  # for parsing node definition in SLF
__link_re__ = r"J=(?P<id>\d+)(?P<attributes>(\s+%s)*)" % __attr_re__  # for parsing link information in SLF
__noise_re__ = r"\+\+[^\+]*\+\+"

# a few mandatory attributes in short and long forms. They will be stored in long form

ATTRIBUTES = {
    'E': 'END',
    'S': 'START',
    'W': 'WORD',
    'a': 'acoustic',
    'l': 'language'
    }

def __get_long_form__(string):
    if ATTRIBUTES.has_key(string):
        return ATTRIBUTES[string]
    else:
        return string

# remove comments and blank lines from input
def __clean__(lines):
    data = []
    for l in lines:
        if l.strip()!='' and l[0]!='#':
            data.append(re.sub("#.*\n",'',l))
    return data

# print a link out of the first node in the list 'remains'
def __print_links_out__(lattice,remains):
    n = lattice.nodes[remains[0]]
    sys.stdout.write("(")
    for l in n.outgoing:
        link = lattice.links[l]
        distance = remains.index(link.to_node)  # this is the distance between current node and the node it is linked to
        a = float(link.attributes['acoustic'])
        l = float(link.attributes['language'])
        sys.stdout.write("('%s',%f,%i)," % (link.attributes['WORD'],a+l*lattice.lmscale,distance))
    sys.stdout.write("),")

class Node:
    """ Node definition for using in word lattice """
    def __init__(self,index,attributes_string=None):
        """index is just the id of the node. attribute_string is a succession fo definition of the form: name=value"""
        self.index = index
        self.incoming = set()
        self.outgoing = set()
        self.attributes = {}
        if attributes_string:
            attributes = attributes_string.split()
            for attr_str in attributes:
                m = re.match(__attr_re__,attr_str)
                attr = __get_long_form__(m.group('attribute'))
                val = m.group('value')
                if self.attributes.has_key(attr):
                    warn("Node %i: attribute '%s' redefined." % (self.index,attr))
                self.attributes[attr] = val
                

class Link:
    """ Link definition for using in word lattice """
    def __init__(self,index,attributes_string=None):
        """index is just the id of the link. attribute_string is a succession fo definition of the form: name=value"""
        self.index = index
        self.from_node = -1
        self.to_node = -1
        self.word = ""
        self.attributes = {}
        if attributes_string:
            attributes = attributes_string.split()
            for attr_str in attributes:
                m = re.match(__attr_re__,attr_str)
                attr = __get_long_form__(m.group('attribute'))
                val = m.group('value')
                if self.attributes.has_key(attr):
                    warn("Node %i: attribute '%s' redefined." % (self.index,attr))
                self.attributes[attr] = val
        try:
            self.from_node = int(self.attributes['START'])
            self.to_node = int(self.attributes['END'])
        except:
            error("Incomplete definition of link %i" % self.index)
        #TODO: check that word attribute is defined either here or on destination node

class Lattice:
    """ Class representing a word lattice with constructor and method for loading a lattice in HTK Standard Lattice Format.
    Also has a method for writing the lattice in Moses input format (PLF) """
    def __init__(self,lattfile=None,lines=None,clean=True):  # can read lattice either from file or data in memory
        """ lattfile should be an open stream. lines should contain a reversed list of already read lines. At least one of them must be provided. Option 'clean' control weither or not noise information should be removes from the lattice. """
        data = []
        if lattfile==None and (lines==None or len(lines)==0):
            error("Where shall I read the lattice from?")
        if lattfile!=None:
            data = lattfile.readlines()  #not very memory efficient, but easier for parsing
            data.reverse()  # lines to be parsed will be stored on a stack
        if lines!=None:
            data = data+lines
        data = __clean__(data)  #remove comments and empty lines
        self.nodes_count = -1
        self.links_count = -1
        self.nodes = {}
        self.links = {}
        self.sublattices = {}
        self.lmscale = 1.0
        self.attributes = {}
        self.__load_graph__(data,clean)
    
    def __load_graph__(self,lines,clean):
        self.__parse_header__(lines)
        self.__parse_counts__(lines)
        self.__parse_nodes__(lines)
        removed_links = self.__parse_links__(lines,clean)
        removed_nodes = 0
        if clean:
            removed_nodes = self.remove_orphan_nodes()
        if len(self.nodes)+removed_nodes!=self.nodes_count or len(self.links)+removed_links!=self.links_count:
            warn("Incorrect nodes or links count")
        if self.attributes.has_key('lmscale'):
            self.lmscale = float(self.attributes['lmscale'])
     
    def __parse_header__(self,lines):
        attributes_tmp = {}
        go = True  # carry on as long as we are parsing a header
        while len(lines)>0 and go:
            l = lines.pop()
            m = re.match(__attr_re__,l)
            if m and not re.match(__size_re__+".*",l):  # should match an attribute but *not* a size definition
                attr = m.group('attribute')
                val = m.group('value')
                if attributes_tmp.has_key(attr):
                    warn("Attribute '%s' redefined." % attr)
                attributes_tmp[attr] = val
                if attr=="SUBLAT": # we are parsing a sublattice. go down, create it and then carry on.
                    sublat = Lattice(lines=lines)
                    # merge attributes
                    for k in attributes_tmp:
                        if sublat.attributes.has_key(k):
                            warn("Sublattice '%s': attribute '%s' redefined." % (val,k))
                    sublat.attributes.update(attributes_tmp)
                    attributes_tmp = {}
                    self.sublattices[val] = sublat
                    if lines.pop().strip()!=":":
                        warn("Missing colon at the end of sublattice '%s'. Continuing anyway, but things could go wrong." % val)
            else:
                go = False
        if not go:
            lines.append(l)
        self.attributes = attributes_tmp
    
    # If I understand correctly counts may be written in any order, on 1 or 2 lines
    # Therefore parsed lines need to be split
    def __parse_counts__(self,lines):
        go = True
        while len(lines)>0 and go:
            l = lines.pop()
            terms = l.split()
            while len(terms)>0 and go:
                t = terms.pop()
                m = re.match(__size_re__,t)
                if m:
                    if m.group('attribute')=='N' or m.group('attribute')=='NODES':
                        if self.nodes_count!=-1:
                            warn("Nodes count redefined.")
                        self.nodes_count = int(m.group('value'))
                    elif m.group('attribute')=='L' or m.group('attribute')=='LINKS':
                        if self.links_count!=-1:
                            warn("Links count redefined.")
                        self.links_count = int(m.group('value'))
                else:
                    go = False
        if not go:
            lines.append(l)
        
        
    def __parse_nodes__(self,lines):
        go = True
        while len(lines)>0 and go:
            l = lines.pop()
            m = re.match(__node_re__,l)
            if m:
                index = int(m.group('id'))
                n = Node(index,m.group('attributes'))
                if self.nodes.has_key(index):
                    warn("Node %i redefined." % index)
                self.nodes[index] = n
            else:
                go = False
        if not go:
            lines.append(l)
        
    def __parse_links__(self,lines,clean):
        removed_links = 0
        go = True
        while len(lines)>0 and go:
            l = lines.pop()
            m = re.match(__link_re__,l)
            if m:
                index = int(m.group('id'))
                if self.links.has_key(index):
                    warn("Link %i redefined." % index)
                e = Link(index,m.group('attributes'))
                # if link is not noise, or if no cleaning required, insert link
                if (not clean) or (not 'WORD' in e.attributes) or (not re.match(__noise_re__,e.attributes['WORD'])):
                    self.__update_nodes__(e)  # Information about nodes is contained in link definition
                    self.links[index] = e
                else:
                    removed_links += 1
            else:
                go = False
        if not go:
            lines.append(l)
        return removed_links
    
    def __update_nodes__(self,link):
        # in case the nodes were not explicitely defined
        if not self.nodes.has_key(link.from_node):
            self.nodes[link.from_node] = Node(link.from_node)
        if not self.nodes.has_key(link.to_node):
            self.nodes[link.to_node] = Node(link.to_node)
        # update connection information of nodes
        self.nodes[link.from_node].outgoing.add(link.index)
        self.nodes[link.to_node].incoming.add(link.index)
    
    def remove_orphan_nodes(self):
        """ Put orphaned nodes out of their misery """
        removed_nodes = 0
        for n in self.nodes.values():
            if len(n.incoming)==0 and len(n.outgoing)==0:
                self.nodes.pop(n.index)
                removed_nodes += 1
        return removed_nodes
    
    # write lattice in PLF
    def write_plf(self):
        """ Write lattice in moses input format (PLF) on standard output. """
        # get topological ordering
        ordered_nodes = topological_ordering(self)
        # NOT THAT EASY
        # ordered nodes of a sublattice can be inserted at the position of the node they replace
        #for i,n in len(ordered_nodes):
            #if self.nodes[n].has_key('L'):  # this node is in fact a sublattice
                #sublat_ordered = self.sublattices
        # write lattice
        sys.stdout.write("(")
        for i,n in enumerate(ordered_nodes[:-1]):  # list all nodes in topological order
            __print_links_out__(self,ordered_nodes[i:])  # and write links leaving them
        sys.stdout.write(")\n")
        return

# topological sorting algorithm needs to be initialised with a list of initial nodes having no incoming link
def get_starting_nodes(lattice):
    """ Given an instance of class Lattice, output the list of all nodes with no incoming link """
    S = []
    for n in lattice.nodes.values():
        if len(n.incoming)==0:
            S.append(n.index)
    return S

def topological_ordering(original_lattice):
    """ Given an instance of class Lattice, output a topological ordering of nodes (as a list of indexes) """
    lattice = copy.deepcopy(original_lattice)  # the lattice will be modified by the program. Work on a copy.
    start_nodes = get_starting_nodes(lattice)
    ordered_nodes = []
    while len(start_nodes)>0:
        n = start_nodes.pop()
        ordered_nodes.append(n)
        for l in set(lattice.nodes[n].outgoing):
            m = lattice.nodes[lattice.links[l].to_node]
            lattice.links.pop(l)
            m.incoming.remove(l)
            lattice.nodes[n].outgoing.remove(l)
            if len(m.incoming)==0:
                start_nodes.append(m.index)
    if len(lattice.links)>0:
        warn("Lattice has at least one cycle.")
    return ordered_nodes


if __name__=='__main__':
    parser = OptionParser(usage=USAGE)
    parser.add_option('-C','--dont-clean',default=True,action="store_false",dest="clean",help="Don't clean input graph: word lattice graph generated by sphinx contain information about noise which should be removed for translation. Use this option if you want to keep it. [clean]")
    parser.add_option('-v','--verbose',default=False,action="store_true",dest="verbose",help="verbosity [%default]")
    opts,args = parser.parse_args()
    if len(args)>1:
        parser.print_help()
        error("bad command line.",1)
    input_file = sys.stdin if (len(args)==0 or args[0]=="-") else open_plain_or_gz(args[0])
    lattice = Lattice(input_file,clean=opts.clean)
    lattice.write_plf()
    if not (len(args)==0 or args[0]=="-"):
        input_file.close()
VERSION=1.1
UTTERANCE=1
lmscale=1
wdpenalty=0.0
#
# Size line
#
N=5   L=5
#
# Node definitions
#
I=0
I=1
I=2
I=3
I=4

#
# Link definitions.
#
J=0     S=0    E=1    W=nous    a=0     l=0
J=1     S=1    E=2    W=incombe a=0     l=0
J=2     S=2    E=3    W=d'      a=0     l=0
J=3     S=3    E=4    W=assurer a=0     l=0
J=4     S=0    E=2    W=incombe a=0     l=0
((('nous',0.000000,1),('incombe',0.000000,2),),(('incombe',0.000000,1),),(('d'',0.000000,1),),(('assurer',0.000000,1),),)
((('nous',0.000000,1),('incombe',0.000000,2),),(('incombe',0.000000,1),),(('d',0.000000,1),),(('assurer',0.000000,1),),)
_______________________________________________
Moses-support mailing list
Moses-support@mit.edu
http://mailman.mit.edu/mailman/listinfo/moses-support

Reply via email to