# Source code for nltk.ccg.lexicon

# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals

import re
from collections import defaultdict

from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
from nltk.compat import python_2_unicode_compatible

# ------------------------------------------------------------------
# Compiled patterns used to pick apart lexicon lines and categories
# ------------------------------------------------------------------

# A primitive category name, optionally followed by a bracketed,
# comma-separated subscript list.
# Groups: (name, '[subscripts]' or None).
rePrim = re.compile(r"([A-Za-z]+)(\[[A-Za-z,]+\])?")

# Splits the leading primitive category (with any subscripts) off the
# front of a string.
# Groups: (primitive, remainder).
reNextPrim = re.compile(r"([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)")

# Splits the leading application operator off the front of a string:
# a forward or backward slash followed by up to two '.' / ',' markers.
# Groups: (slash, marker, marker, remainder).
reApp = re.compile(r"([\\/])([.,]?)([.,]?)(.*)")

# A definition line for either a family ('::') or a word (an arrow made
# of '-' / '=' characters ending in '>').
# Groups: (identifier, separator, category string).
reLex = re.compile(r"([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)")

# Group 1 is the text before the first '#'; the '#' and whatever follows
# it on the line are matched but not captured.
reComm = re.compile(r"([^#]*)(?:#.*)?")

#----------
# Lexicons
#----------
@python_2_unicode_compatible
class CCGLexicon(object):
    '''
    Class representing a lexicon for CCG grammars.
    primitives - The list of primitive categories for the lexicon
    families - Families of categories
    entries - A mapping of words to possible categories
    '''
    def __init__(self, start, primitives, families, entries):
        '''
        Store the components of the lexicon.

        start - name of the target category; it is wrapped in a
                PrimitiveCategory before being stored
        primitives, families, entries - stored as given (see class doc)
        '''
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        '''
        Returns all the possible categories for a word.

        This is a plain subscript lookup on the entries mapping, so what
        happens for an unknown word depends on the mapping that was
        supplied to the constructor.
        '''
        return self._entries[word]

    def start(self):
        '''Returns the target category for the parser.'''
        return self._start

    def __str__(self):
        '''
        String representation of the lexicon, used for debugging.

        One line per entry, in the iteration order of the entries
        mapping: ``word => cat1 | cat2 | ...``.
        '''
        lines = []
        for ident in self._entries:
            # Each entry is rendered independently.  The separators must
            # not depend on state left over from the previous entry: an
            # entry whose category list is empty still has to be followed
            # by a newline before the next one.
            cats = " | ".join("%s" % cat for cat in self._entries[ident])
            lines.append(ident + " => " + cats)
        return "\n".join(lines)


#-----------
# Parsing lexicons
#-----------

# Separates the contents matching the first set of brackets
# from the rest of the input.
def matchBrackets(string):
    '''
    Separate the leading bracketed group from the rest of the input.

    string - text whose first character is taken to be the opening '(';
             that character is skipped without being checked.

    Returns a tuple (group, remainder): group is '(' followed by
    everything up to and including the ')' that balances the opening
    bracket (nested brackets are kept intact), and remainder is whatever
    follows that ')'.

    Raises AssertionError if the opening bracket is never closed.
    '''
    # Track the nesting depth in a single left-to-right pass instead of
    # recursing per nested bracket, so deeply nested input cannot exhaust
    # the recursion limit and the scan stays linear in the input length.
    depth = 1
    for pos in range(1, len(string)):
        char = string[pos]
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                return ('(' + string[1:pos + 1], string[pos + 1:])
    raise AssertionError('Unmatched bracket in string \'' + string + '\'')


# Separates the string for the next portion of the category
# from the rest of the string
def nextCategory(string):
    '''
    Split the first category off the front of ``string``.

    Returns a pair (category text, remainder).  A string that begins with
    '(' is split by matchBrackets; anything else is split with the
    reNextPrim pattern.
    '''
    if not string.startswith('('):
        return reNextPrim.match(string).groups()
    return matchBrackets(string)


# Parses an application operator
def parseApplication(app):
    '''
    Build a Direction from the pieces of an application operator.

    app - a sequence whose first item is handed to Direction as its first
          argument, with all remaining items as its second argument.
    '''
    slash = app[0]
    restrictions = app[1:]
    return Direction(slash, restrictions)


# Parses the subscripts for a primitive category
def parseSubscripts(subscr):
    '''
    Turn a bracketed subscript string into a list of subscripts.

    The first and last characters (the brackets) are dropped and the rest
    is split on commas.  A false value (None or '') gives an empty list.
    '''
    return subscr[1:-1].split(',') if subscr else []


# Parse a primitive category
def parsePrimitiveCategory(chunks, primitives, families, var):
    '''
    Resolve one primitive-level token to a category.

    chunks     - (name, subscript text or None), as produced by rePrim
    primitives - collection of known primitive category names
    families   - mapping of family name to (category, variable)
    var        - the variable in use for the category being parsed, or
                 None if there is none yet

    Returns a pair (category, variable).
    Raises AssertionError if the name is neither a family nor a primitive.
    '''
    catstr = chunks[0]

    # The special name 'var' without subscripts stands for the category
    # variable; one is created the first time it is needed.
    if catstr == "var" and chunks[1] is None:
        if var is None:
            var = CCGVar()
        return (var, var)

    if catstr in families:
        cat, family_var = families[catstr]
        if var is not None:
            # A variable is already in use: rewrite the family's own
            # variable to it.
            return (cat.substitute([(family_var, var)]), var)
        # No variable yet: adopt the one that comes with the family.
        return (cat, family_var)

    if catstr not in primitives:
        raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')

    subscripts = parseSubscripts(chunks[1])
    return (PrimitiveCategory(catstr, subscripts), var)


# parseCategory drops the 'var' from the tuple
def parseCategory(line, primitives, families):
    '''
    Parse a category string and return just the category.

    Delegates to augParseCategory and discards the variable part of its
    result.
    '''
    parsed = augParseCategory(line, primitives, families)
    return parsed[0]


# Parses a string representing a category, and returns
# a tuple with (possibly) the CCG variable for the category
def augParseCategory(line, primitives, families, var=None):
    '''
    Parse a category string into a (category, variable) pair.

    The string is read left to right: a first operand, then any number of
    "operator operand" pairs, each of which wraps the result so far in a
    FunctionalCategory.  The variable is threaded through every operand
    so that all of them end up sharing it.
    '''
    def parse_operand(token, current_var):
        # A bracketed operand is a whole category in its own right;
        # anything else is a single primitive (or family) name.
        if token.startswith('('):
            return augParseCategory(token[1:-1], primitives, families, current_var)
        return parsePrimitiveCategory(rePrim.match(token).groups(),
                                      primitives, families, current_var)

    token, remainder = nextCategory(line)
    result, var = parse_operand(token, var)

    while remainder != "":
        operator = reApp.match(remainder).groups()
        direction = parseApplication(operator[0:3])
        token, remainder = nextCategory(operator[3])
        argument, var = parse_operand(token, var)
        result = FunctionalCategory(result, argument, direction)

    return (result, var)


# Takes an input string, and converts it into a lexicon for CCGs.
def parseLexicon(lex_str):
    '''
    Build a CCGLexicon from its textual description.

    Each line is handled on its own, after removing any '#' comment and
    surrounding whitespace:
      ``:- S, N, NP``     declares primitive categories
      ``Det :: NP/N``     defines a family
      ``the => NP/N``     adds a category for a word
    The first primitive declared becomes the start category.
    '''
    primitives = []
    families = {}
    entries = defaultdict(list)

    for raw_line in lex_str.splitlines():
        text = reComm.match(raw_line).group(1).strip()
        if not text:
            continue

        if text.startswith(':-'):
            # Primitive declarations accumulate across lines.
            primitives.extend(name.strip() for name in text[2:].split(','))
            continue

        ident, sep, catstr = reLex.match(text).groups()
        cat, var = augParseCategory(catstr, primitives, families)
        if sep == '::':
            # Family definition: the variable is kept alongside the
            # category.
            families[ident] = (cat, var)
        else:
            # Word definition: a word may be given several categories.
            entries[ident].append(cat)

    return CCGLexicon(primitives[0], primitives, families, entries)
# Sample lexicon, built when the module is imported.
# Every definition has to sit on its own line: parseLexicon works line by
# line and discards everything from the first '#' on a line as a comment.
openccg_tinytiny = parseLexicon('''
    # Rather minimal lexicon based on the openccg `tinytiny' grammar.
    # Only incorporates a subset of the morphological subcategories, however.
    :- S,NP,N                    # Primitive categories
    Det :: NP/N                  # Determiners
    Pro :: NP
    IntransVsg :: S\\NP[sg]      # Tensed intransitive verbs (singular)
    IntransVpl :: S\\NP[pl]      # Plural
    TransVsg :: S\\NP[sg]/NP     # Tensed transitive verbs (singular)
    TransVpl :: S\\NP[pl]/NP     # Plural

    the => NP[sg]/N[sg]
    the => NP[pl]/N[pl]

    I => Pro
    me => Pro
    we => Pro
    us => Pro

    book => N[sg]
    books => N[pl]

    peach => N[sg]
    peaches => N[pl]

    policeman => N[sg]
    policemen => N[pl]

    boy => N[sg]
    boys => N[pl]

    sleep => IntransVsg
    sleep => IntransVpl

    eat => IntransVpl
    eat => TransVpl
    eats => IntransVsg
    eats => TransVsg

    see => TransVpl
    sees => TransVsg
    ''')