# Natural Language Toolkit: Glue Semantics
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# Copyright (C) 2001-2013 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, division, unicode_literals
import os
import nltk
from nltk.internals import Counter
from nltk.compat import string_types
from nltk.corpus import brown
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
from nltk.sem.logic import (LogicParser, Expression, Variable, VariableExpression,
LambdaExpression, AbstractVariableExpression)
from nltk.compat import python_2_unicode_compatible
from nltk.sem import drt
from nltk.sem import linearlogic
# Semtype names for specifier (determiner) words, keyed by the lowercased
# word; get_semtypes() falls back to the 'default' entry for any specifier
# not listed here.
SPEC_SEMTYPES = {'a' : 'ex_quant',
                 'an' : 'ex_quant',
                 'every' : 'univ_quant',
                 'the' : 'def_art',
                 'no' : 'no_quant',
                 'default' : 'ex_quant'}

# Dependency relations that are ignored when matching a node's set of
# dependent relations against a GlueDict entry (see _lookup_semtype_option).
OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct']
@python_2_unicode_compatible
class GlueDict(dict):
    """
    A dictionary of glue-formula templates read from a semtype grammar file.

    Structure after ``read_file()``:
    ``self[semtype][relationship_frozenset] = [[meaning_str, glue_str], ...]``
    where ``relationship_frozenset`` may be ``None`` for entries that apply
    regardless of the node's dependents.

    NOTE(review): ``get_GlueFormula_factory`` and ``get_meaning_formula``
    (used by ``get_glueformulas_from_semtype_entry``) are expected to be
    provided elsewhere (e.g. in a part of this module not shown here or by a
    subclass) — confirm before use.
    """

    def __init__(self, filename, encoding=None):
        self.filename = filename            # path or nltk.data resource name
        self.file_encoding = encoding       # passed through to nltk.data.load
        self.read_file()

    def read_file(self, empty_first=True):
        """
        Parse ``self.filename`` and populate this dict with its entries.

        :param empty_first: if True, clear existing entries before loading.
        :raises RuntimeError: if a formula line is malformed.
        """
        if empty_first:
            self.clear()

        try:
            contents = nltk.data.load(self.filename, format='text',
                                      encoding=self.file_encoding)
            # TODO: the above can't handle zip files, but this should anyway
            # be fixed in nltk.data.load()
        except LookupError as e:
            # Retry with an explicit 'file:' prefix before giving up.
            try:
                contents = nltk.data.load('file:' + self.filename,
                                          format='text',
                                          encoding=self.file_encoding)
            except LookupError:
                raise e
        lines = contents.splitlines()

        for line in lines:          # example: 'n : (\\x.(<word> x), (v-or))'
                                    #      lambdacalc -^  linear logic -^
            line = line.strip()     # remove trailing newline
            if not len(line):       # skip empty lines
                continue
            if line[0] == '#':      # skip commented-out lines
                continue

            parts = line.split(' : ', 2)  # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']

            glue_formulas = []
            paren_count = 0
            tuple_start = 0
            tuple_comma = 0

            relationships = None

            if len(parts) > 1:
                # Scan the formula section character-by-character, collecting
                # each parenthesized (meaning, glue) tuple.
                for (i, c) in enumerate(parts[1]):
                    if c == '(':
                        if paren_count == 0:      # the first '(' of a tuple:
                            tuple_start = i + 1   # save the start index
                        paren_count += 1
                    elif c == ')':
                        paren_count -= 1
                        if paren_count == 0:      # the last ')' of a tuple
                            meaning_term = parts[1][tuple_start:tuple_comma]  # '\\x.(<word> x)'
                            glue_term = parts[1][tuple_comma + 1:i]           # '(v-r)'
                            glue_formulas.append([meaning_term, glue_term])   # add the GlueFormula to the list
                    elif c == ',':
                        if paren_count == 1:      # a comma separating the parts of the tuple:
                            tuple_comma = i       # save its index
                    elif c == '#':                # skip comments at the ends of lines
                        if paren_count != 0:      # the line hasn't parsed correctly so far
                            raise RuntimeError('Formula syntax is incorrect for entry ' + line)
                        break                     # break to the next line

            if len(parts) > 2:  # there is a relationship entry at the end
                rel_start = parts[2].index('[') + 1
                rel_end = parts[2].index(']')
                if rel_start == rel_end:          # '[]' means "no dependents"
                    relationships = frozenset()
                else:
                    relationships = frozenset(
                        r.strip() for r in parts[2][rel_start:rel_end].split(','))

            try:
                start_inheritance = parts[0].index('(')
                end_inheritance = parts[0].index(')')
                sem = parts[0][:start_inheritance].strip()
                supertype = parts[0][start_inheritance + 1:end_inheritance]
            except ValueError:
                # No '(supertype)' annotation on this entry.
                sem = parts[0].strip()
                supertype = None

            if sem not in self:
                self[sem] = {}

            if relationships is None:
                # Not specified for a specific relationship set, so add all
                # relationship entries for parents.
                if supertype:
                    for rels in self[supertype]:
                        if rels not in self[sem]:
                            self[sem][rels] = []
                        glue = self[supertype][rels]
                        self[sem][rels].extend(glue)
                        self[sem][rels].extend(glue_formulas)  # add the glue formulas to every rel entry
                else:
                    if None not in self[sem]:
                        self[sem][None] = []
                    self[sem][None].extend(glue_formulas)      # add the glue formulas to every rel entry
            else:
                if relationships not in self[sem]:
                    self[sem][relationships] = []
                if supertype:
                    self[sem][relationships].extend(self[supertype][relationships])
                self[sem][relationships].extend(glue_formulas)  # add the glue entry to the dictionary

    def __str__(self):
        accum = ''
        for pos in self:
            str_pos = "%s" % pos
            for relset in self[pos]:
                i = 1
                for gf in self[pos][relset]:
                    if i == 1:
                        accum += str_pos + ': '
                    else:
                        # Align continuation lines under the first formula.
                        accum += ' ' * (len(str_pos) + 2)
                    accum += "%s" % gf
                    if relset and i == len(self[pos][relset]):
                        accum += ' : %s' % relset
                    accum += '\n'
                    i += 1
        return accum

    def lookup(self, node, depgraph, counter):
        """
        Return the glue formulas for ``node``; an empty list if no semtype
        for the node is present in this dictionary.

        :raises KeyError: if the semtype is known but no entry matches the
            node's tag and relationship set.
        """
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # No GlueDict entry exists for this sem type; treat the node as
            # contributing no glue formulas.
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError("There is no GlueDict entry for sem type of '%s'"
                           " with tag '%s', and rel '%s'" %
                           (node['word'], node['tag'], node['rel']))

        return self.get_glueformulas_from_semtype_entry(
            lookup, node['word'], node, depgraph, counter)

    def add_missing_dependencies(self, node, depgraph):
        """For a 'main' node, attach the head's 'subj' dependent to it."""
        rel = node['rel'].lower()

        if rel == 'main':
            headnode = depgraph.nodelist[node['head']]
            subj = self.lookup_unique('subj', headnode, depgraph)
            node['deps'].append(subj['address'])

    def _lookup_semtype_option(self, semtype, node, depgraph):
        # The node's set of dependent relations, ignoring optional ones.
        relationships = frozenset(
            depgraph.nodelist[dep]['rel'].lower()
            for dep in node['deps']
            if depgraph.nodelist[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS)

        try:
            lookup = semtype[relationships]
        except KeyError:
            # An exact match is not found, so find the best match where
            # 'best' is defined as the glue entry whose relationship set has
            # the most relations of any possible relationship set that is a
            # (proper) subset of the actual relationship set.
            best_match = frozenset()
            for relset_option in set(semtype) - set([None]):
                if len(relset_option) > len(best_match) and \
                        relset_option < relationships:
                    best_match = relset_option
            if not best_match:
                if None in semtype:
                    best_match = None
                else:
                    # Return an empty list (not None) so the caller's
                    # len() check raises its intended KeyError.
                    return []
            lookup = semtype[best_match]

        return lookup

    def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']]

    def get_glueformulas_from_semtype_entry(self, lookup, word, node,
                                            depgraph, counter):
        """
        Instantiate each (meaning, glue) template in ``lookup`` for ``word``.
        Subsequent formulas for the same word get a numeric suffix on .word.
        """
        glueformulas = []

        glueFormulaFactory = self.get_GlueFormula_factory()
        for meaning, glue in lookup:
            gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
            if not len(glueformulas):
                gf.word = word
            else:
                gf.word = '%s%s' % (word, len(glueformulas) + 1)

            gf.glue = self.initialize_labels(gf.glue, node, depgraph,
                                             counter.get())

            glueformulas.append(gf)
        return glueformulas

    def initialize_labels(self, expr, node, depgraph, unique_index):
        """Recursively replace label names in a linear-logic expression with
        concrete labels for ``node`` (uppercase -> variable, else constant)."""
        if isinstance(expr, linearlogic.AtomicExpression):
            name = self.find_label_name(expr.name.lower(), node, depgraph,
                                        unique_index)
            if name[0].isupper():
                return linearlogic.VariableExpression(name)
            else:
                return linearlogic.ConstantExpression(name)
        else:
            return linearlogic.ImpExpression(
                self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
                self.initialize_labels(expr.consequent, node, depgraph, unique_index))

    def find_label_name(self, name, node, depgraph, unique_index):
        """Resolve a (possibly dotted) label name relative to ``node``."""
        try:
            dot = name.index('.')

            before_dot = name[:dot]
            after_dot = name[dot + 1:]
            if before_dot == 'super':
                # Resolve the rest of the name against the head node.
                return self.find_label_name(
                    after_dot, depgraph.nodelist[node['head']],
                    depgraph, unique_index)
            else:
                # Resolve against the unique dependent named by the prefix.
                return self.find_label_name(
                    after_dot, self.lookup_unique(before_dot, node, depgraph),
                    depgraph, unique_index)
        except ValueError:
            # No '.' in the name: it refers directly to this node.
            lbl = self.get_label(node)
            if name == 'f':
                return lbl
            elif name == 'v':
                return '%sv' % lbl
            elif name == 'r':
                return '%sr' % lbl
            elif name == 'super':
                return self.get_label(depgraph.nodelist[node['head']])
            elif name == 'var':
                return '%s%s' % (lbl.upper(), unique_index)
            elif name == 'a':
                return self.get_label(self.lookup_unique('conja', node, depgraph))
            elif name == 'b':
                return self.get_label(self.lookup_unique('conjb', node, depgraph))
            else:
                return self.get_label(self.lookup_unique(name, node, depgraph))

    def get_label(self, node):
        """
        Pick an alphabetic character as identifier for an entity in the model,
        based on the node's address.  Addresses 1..26 map to 'f','g',...,'e';
        larger addresses wrap around and gain a numeric suffix (e.g. 'f1').
        """
        value = node['address']

        # Wrap with % 26 so addresses beyond 26 don't raise IndexError; the
        # numeric suffix below keeps the labels distinct.
        letter = ['f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
                  'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'a', 'b', 'c',
                  'd', 'e'][(value - 1) % 26]
        num = int(value) // 26
        if num > 0:
            return letter + str(num)
        else:
            return letter

    def lookup_unique(self, rel, node, depgraph):
        """
        Lookup 'key'. There should be exactly one item in the associated
        relation.

        :raises KeyError: if there is not exactly one dependent with ``rel``.
        """
        deps = [depgraph.nodelist[dep] for dep in node['deps']
                if depgraph.nodelist[dep]['rel'].lower() == rel.lower()]

        if len(deps) == 0:
            raise KeyError("'%s' doesn't contain a feature '%s'"
                           % (node['word'], rel))
        elif len(deps) > 1:
            raise KeyError("'%s' should only have one feature '%s'"
                           % (node['word'], rel))
        else:
            return deps[0]
class Glue(object):
    """
    Glue semantics pipeline: dependency-parse a sentence, map each parse to
    glue formulas via a GlueDict, compile them, and derive meaning readings
    by linear-logic deduction.
    """

    def __init__(self, semtype_file=None, remove_duplicates=False,
                 depparser=None, verbose=False):
        """
        :param semtype_file: path to a semtype grammar file (default: the
            sample 'glue.semtype' grammar).
        :param remove_duplicates: if True, drop logically equivalent readings
            (checked with Prover9).
        :param depparser: a dependency parser; lazily created in dep_parse()
            if None.
        :param verbose: if True, print intermediate results.
        """
        self.verbose = verbose
        self.remove_duplicates = remove_duplicates
        self.depparser = depparser

        from nltk import Prover9
        self.prover = Prover9()

        if semtype_file:
            self.semtype_file = semtype_file
        else:
            self.semtype_file = os.path.join('grammars', 'sample_grammars',
                                             'glue.semtype')

    def train_depparser(self, depgraphs=None):
        """Train the dependency parser from ``depgraphs``, or from the
        sample CoNLL training file if none are given."""
        if depgraphs:
            self.depparser.train(depgraphs)
        else:
            self.depparser.train_from_file(nltk.data.find(
                os.path.join('grammars', 'sample_grammars',
                             'glue_train.conll')))

    def parse_to_meaning(self, sentence):
        """Return all meaning readings derivable for ``sentence``."""
        readings = []
        for agenda in self.parse_to_compiled(sentence):
            readings.extend(self.get_readings(agenda))
        return readings

    def get_readings(self, agenda):
        """
        Derive readings from an agenda of compiled glue formulas.

        Repeatedly pops a formula; implication (non-atomic) formulas are
        applied to previously seen atomics whose category unifies with the
        antecedent, and atomics are fed to previously seen implications.
        Only combinations with disjoint premise-index sets are kept; a
        result counts as a reading when its indices cover the whole agenda.
        """
        readings = []
        agenda_length = len(agenda)
        atomics = dict()
        nonatomics = dict()
        while agenda:  # is not empty
            cur = agenda.pop()
            glue_simp = cur.glue.simplify()
            if isinstance(glue_simp, linearlogic.ImpExpression):  # if cur.glue is non-atomic
                for key in atomics:
                    try:
                        if isinstance(cur.glue, linearlogic.ApplicationExpression):
                            bindings = cur.glue.bindings
                        else:
                            bindings = linearlogic.BindingDict()
                        glue_simp.antecedent.unify(key, bindings)
                        for atomic in atomics[key]:
                            if not (cur.indices & atomic.indices):  # if the sets of indices are disjoint
                                try:
                                    agenda.append(cur.applyto(atomic))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                    except linearlogic.UnificationException:
                        pass
                try:
                    nonatomics[glue_simp.antecedent].append(cur)
                except KeyError:
                    nonatomics[glue_simp.antecedent] = [cur]
            else:  # else cur.glue is atomic
                for key in nonatomics:
                    for nonatomic in nonatomics[key]:
                        try:
                            if isinstance(nonatomic.glue, linearlogic.ApplicationExpression):
                                bindings = nonatomic.glue.bindings
                            else:
                                bindings = linearlogic.BindingDict()
                            glue_simp.unify(key, bindings)
                            if not (cur.indices & nonatomic.indices):  # if the sets of indices are disjoint
                                try:
                                    agenda.append(nonatomic.applyto(cur))
                                except linearlogic.LinearLogicApplicationException:
                                    pass
                        except linearlogic.UnificationException:
                            pass
                try:
                    atomics[glue_simp].append(cur)
                except KeyError:
                    atomics[glue_simp] = [cur]

        # A complete reading must have consumed every premise in the agenda.
        for entry in atomics:
            for gf in atomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        for entry in nonatomics:
            for gf in nonatomics[entry]:
                if len(gf.indices) == agenda_length:
                    self._add_to_reading_list(gf, readings)
        return readings

    def _add_to_reading_list(self, glueformula, reading_list):
        """Append glueformula.meaning to reading_list, unless duplicate
        removal is enabled and an equivalent reading is already present."""
        add_reading = True
        if self.remove_duplicates:
            for reading in reading_list:
                try:
                    if reading.equiv(glueformula.meaning, self.prover):
                        add_reading = False
                        break
                except Exception as e:
                    # If there is an exception, the syntax of the formula may
                    # not be understandable by the prover, so don't throw out
                    # the reading.
                    print('Error when checking logical equality of statements', e)
        if add_reading:
            reading_list.append(glueformula.meaning)

    def parse_to_compiled(self, sentence=None):
        """Dependency-parse ``sentence`` (a list of words; defaults to the
        demo sentence) and return one compiled glue-formula list per parse."""
        if sentence is None:
            sentence = 'a man sees Mary'.split()  # demo default
        gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
        return [self.gfl_to_compiled(gfl) for gfl in gfls]

    def dep_parse(self, sentence=None):
        """Return a list of dependency graphs for ``sentence`` (a list of
        words; defaults to the demo sentence)."""
        if sentence is None:
            sentence = 'every cat leaves'.split()  # demo default

        # Lazy-initialize the depparser.
        if self.depparser is None:
            from nltk.parse import MaltParser
            self.depparser = MaltParser(tagger=self.get_pos_tagger())
        if not self.depparser._trained:
            self.train_depparser()

        return [self.depparser.parse(sentence, verbose=self.verbose)]

    def depgraph_to_glue(self, depgraph):
        """Map a dependency graph to a list of glue formulas."""
        return self.get_glue_dict().to_glueformula_list(depgraph)

    def get_glue_dict(self):
        """Load and return the GlueDict for this parser's semtype file."""
        return GlueDict(self.semtype_file)

    def gfl_to_compiled(self, gfl):
        """Compile a glue-formula list, numbering premises with a counter."""
        index_counter = Counter()
        return_list = []
        for gf in gfl:
            return_list.extend(gf.compile(index_counter))

        if self.verbose:
            print('Compiled Glue Premises:')
            for cgf in return_list:
                print(cgf)

        return return_list

    def get_pos_tagger(self):
        """Build a backoff POS-tagger chain (regexp -> unigram -> bigram ->
        trigram over the Brown news corpus), with quantifier overrides."""
        regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers ('.' escaped)
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
             ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words.
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
             ], backoff=trigram_tagger)

        return main_tagger
[docs]class DrtGlueDict(GlueDict):
class DrtGlue(Glue):
    """A Glue parser that uses the DRT-based glue dictionary and, unless
    overridden, the sample 'drt_glue.semtype' grammar."""

    def __init__(self, semtype_file=None, remove_duplicates=False,
                 depparser=None, verbose=False):
        # Fall back to the sample DRT grammar when no file was supplied.
        if not semtype_file:
            semtype_file = os.path.join('grammars', 'sample_grammars',
                                        'drt_glue.semtype')
        Glue.__init__(self, semtype_file, remove_duplicates, depparser,
                      verbose)

    def get_glue_dict(self):
        """Load the DRT variant of the glue dictionary."""
        return DrtGlueDict(self.semtype_file)
def demo(show_example=-1):
    """
    Run the glue-semantics demo over a fixed list of example sentences.

    :param show_example: index of the single example to run, or -1 to run
        them all.
    """
    from nltk.parse import MaltParser

    examples = ['David sees Mary',
                'David eats a sandwich',
                'every man chases a dog',
                'every man believes a dog sleeps',
                'John gives David a sandwich',
                'John chases himself']
    # Further examples the grammar does not yet cover:
    #           'John persuades David to order a pizza',
    #           'John tries to go',
    #           'John tries to find a unicorn',
    #           'John seems to vanish',
    #           'a unicorn seems to approach',
    #           'every big cat leaves',
    #           'every gray cat leaves',
    #           'every big gray cat leaves',
    #           'a former senator leaves',

    print('============== DEMO ==============')

    # A closed-vocabulary tagger sufficient for the demo sentences.
    tagger = RegexpTagger(
        [('^(David|Mary|John)$', 'NNP'),
         ('^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'),
         ('^(go|order|vanish|find|approach)$', 'VB'),
         ('^(a)$', 'ex_quant'),
         ('^(every)$', 'univ_quant'),
         ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
         ('^(big|gray|former)$', 'JJ'),
         ('^(him|himself)$', 'PRP')
         ])

    depparser = MaltParser(tagger=tagger)
    glue = Glue(depparser=depparser, verbose=False)

    for (i, sentence) in enumerate(examples):
        if i != show_example and show_example != -1:
            continue
        print('[[[Example %s]]] %s' % (i, sentence))
        for reading in glue.parse_to_meaning(sentence.split()):
            print(reading.simplify())
        print('')
# Run the demo when this module is executed as a script.
if __name__ == '__main__':
    demo()