# Natural Language Toolkit: Combinatory Categorial Grammar
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
import re
from collections import defaultdict
from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
from nltk.compat import python_2_unicode_compatible
#------------
# Regular expressions used for parsing components of the lexicon
#------------
# Parses a primitive category and subscripts.
# Group 1 is the alphabetic category name; group 2 is the bracketed,
# comma-separated subscript list (e.g. "[sg]"), or None when absent.
rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
# Separates the next primitive category from the remainder of the
# string.
# Group 1 is the primitive (including any subscripts); group 2 is
# whatever follows it.
reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
# Separates the next application operator from the remainder.
# Group 1 is the slash, groups 2 and 3 are each an optional '.' or ','
# modifier, and group 4 is the rest of the string.
reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
# Parses the definition of the category of either a word or a family.
# Group 1 is the identifier (letters and underscores), group 2 is the
# separator ('::' or an arrow built from '-'/'=' characters ending in
# '>'), and group 3 is the category string.
reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''')
# Strips comments from a line: group 1 is everything before the first
# '#'.
reComm = re.compile('''([^#]*)(?:#.*)?''')
#----------
# Lexicons
#----------
@python_2_unicode_compatible
class CCGLexicon(object):
    """
    Class representing a lexicon for CCG grammars.

    primitives - The list of primitive categories for the lexicon
    families - Families of categories
    entries - A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        """
        Store the components of the lexicon.

        ``start`` is wrapped in a ``PrimitiveCategory`` and becomes the
        value returned by ``start()``; the other arguments are stored
        as given.
        """
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Return all the possible categories for ``word``.

        NOTE(review): this is a plain ``self._entries[word]`` lookup, so
        when the mapping is a ``defaultdict`` (as ``parseLexicon``
        builds it) asking about an unknown word inserts an empty entry.
        """
        return self._entries[word]

    def start(self):
        """Return the target category for the parser."""
        return self._start

    def __str__(self):
        """
        Return a string representation of the lexicon (for debugging).

        Each word is rendered on its own line as
        ``word => cat1 | cat2 | ...``.
        """
        # Build each line independently and join them, so a word with
        # no categories still gets a line break before the next word.
        lines = []
        for ident in self._entries:
            cats = " | ".join("%s" % cat for cat in self._entries[ident])
            lines.append(ident + " => " + cats)
        return "\n".join(lines)
#-----------
# Parsing lexicons
#-----------
# Separates the contents matching the first set of brackets
# from the rest of the input.
def matchBrackets(string):
    """
    Split ``string`` into its leading bracketed group and the remainder.

    The first character is taken to be the opening bracket (it is not
    checked). Nested brackets are kept inside the returned group.

    :param string: text beginning with a bracketed expression
    :return: a tuple ``(group, rest)`` where ``group`` includes the
        enclosing brackets and ``rest`` is everything after the
        matching close bracket
    :raise AssertionError: if a bracket is never closed; the message
        quotes the text starting at the innermost unclosed bracket
    """
    # Positions of the brackets that are currently open; position 0 is
    # the bracket that starts the group.
    open_positions = [0]
    for pos, char in enumerate(string[1:], 1):
        if char == '(':
            open_positions.append(pos)
        elif char == ')':
            open_positions.pop()
            if not open_positions:
                # The outermost bracket has just been closed.
                return ('(' + string[1:pos + 1], string[pos + 1:])
    unmatched = string[open_positions[-1]:]
    raise AssertionError('Unmatched bracket in string \'' + unmatched + '\'')
# Separates the string for the next portion of the category
# from the rest of the string
def nextCategory(string):
    """
    Split off the next portion of a category from the rest of ``string``.

    A leading bracket is handled by ``matchBrackets``; otherwise the
    next primitive category (with any subscripts) is split off using
    ``reNextPrim``.

    :param string: the unparsed remainder of a category string
    :return: a tuple ``(portion, rest)``
    :raise AssertionError: if ``string`` starts with neither a bracket
        nor a primitive category, or if a bracket is unmatched
    """
    if string.startswith('('):
        return matchBrackets(string)
    match = reNextPrim.match(string)
    if match is None:
        # Without this check the failure surfaces as an AttributeError
        # on None, which says nothing about the offending input.
        raise AssertionError('Expected a category at the start of string \'' + string + '\'')
    return match.groups()
# Parses an application operator
def parseApplication(app):
    """
    Build a ``Direction`` from an application operator.

    The first element of ``app`` is passed as the direction and the
    remaining elements as its restrictions.
    """
    slash = app[0]
    restrictions = app[1:]
    return Direction(slash, restrictions)
# Parses the subscripts for a primitive category
def parseSubscripts(subscr):
    """
    Turn a bracketed subscript string such as ``"[sg,pl]"`` into a list.

    The first and last characters (the brackets) are dropped and the
    rest is split on commas. A false value (``None`` or ``""``) gives
    an empty list.
    """
    if not subscr:
        return []
    inner = subscr[1:-1]
    return inner.split(',')
# Parse a primitive category
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Resolve one primitive-looking chunk to a category.

    :param chunks: a pair ``(name, subscripts)`` as produced by
        ``rePrim``; ``subscripts`` is None when there are none
    :param primitives: the declared primitive category names
    :param families: mapping from family name to ``(category, var)``
    :param var: the CCG variable in use so far, or None
    :return: a tuple ``(category, var)``
    :raise AssertionError: if the name is neither a family nor a
        primitive category
    """
    name = chunks[0]

    # A bare 'var' (no subscripts) stands for the CCG variable itself;
    # create one on first use and share it afterwards.
    if name == "var" and chunks[1] is None:
        if var is None:
            var = CCGVar()
        return (var, var)

    if name in families:
        (family_cat, family_var) = families[name]
        if var is None:
            # No variable in use yet: adopt the family's own.
            return (family_cat, family_var)
        # Otherwise rewrite the family's variable to the one in use.
        return (family_cat.substitute([(family_var, var)]), var)

    if name not in primitives:
        raise AssertionError("String '" + name + "' is neither a family nor primitive category.")

    subscripts = parseSubscripts(chunks[1])
    return (PrimitiveCategory(name, subscripts), var)
# parseCategory drops the 'var' from the tuple
def parseCategory(line, primitives, families):
    """
    Parse ``line`` into a category, discarding the CCG variable.

    This is ``augParseCategory`` with only the first element of its
    result returned.
    """
    parsed = augParseCategory(line, primitives, families)
    return parsed[0]
# Parses a string representing a category, and returns
# a tuple with (possibly) the CCG variable for the category
def _parseCategoryChunk(chunk, primitives, families, var):
    """Parse one bracketed or primitive chunk into ``(category, var)``."""
    if chunk.startswith('('):
        # Strip the enclosing brackets and parse the contents.
        return augParseCategory(chunk[1:-1], primitives, families, var)
    return parsePrimitiveCategory(rePrim.match(chunk).groups(), primitives, families, var)


def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category.

    The string is read left to right: a first chunk, then any number of
    ``operator chunk`` pairs, each of which wraps the result so far in
    a ``FunctionalCategory``.

    :param line: the category string
    :param primitives: the declared primitive category names
    :param families: mapping from family name to ``(category, var)``
    :param var: the CCG variable in use so far, or None
    :return: a tuple ``(category, var)`` with (possibly) the CCG
        variable for the category
    :raise AssertionError: if an application operator is expected but
        not found, or if a chunk cannot be parsed
    """
    (chunk, rest) = nextCategory(line)
    (res, var) = _parseCategoryChunk(chunk, primitives, families, var)

    while rest != "":
        app_match = reApp.match(rest)
        if app_match is None:
            # Without this check the failure surfaces as an
            # AttributeError on None.
            raise AssertionError('Expected an application operator at \'' + rest + '\'')
        app = app_match.groups()
        direction = parseApplication(app[0:3])

        (chunk, rest) = nextCategory(app[3])
        (arg, var) = _parseCategoryChunk(chunk, primitives, families, var)
        res = FunctionalCategory(res, arg, direction)

    return (res, var)
# Takes an input string, and converts it into a lexicon for CCGs.
def parseLexicon(lex_str):
    """
    Take an input string and convert it into a lexicon for CCGs.

    Each line is stripped of comments and surrounding whitespace, and
    blank lines are skipped. The remaining lines are one of:

    - ``:- S, N, NP`` -- a list of primitive categories; the first
      primitive declared is the target category
    - ``Det :: NP/N`` -- a family definition
    - ``which => (N\\N)/(S/NP)`` -- a word definition

    :param lex_str: the text of the lexicon
    :return: a ``CCGLexicon``
    :raise AssertionError: if a line is malformed, a category cannot be
        parsed, or no primitive categories are declared
    """
    primitives = []
    families = {}
    entries = defaultdict(list)
    for raw_line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(raw_line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first line is the target category
            # ie, :- S, N, NP, VP
            primitives.extend(prim.strip() for prim in line[2:].strip().split(','))
            continue

        # Either a family definition, or a word definition
        lex_match = reLex.match(line)
        if lex_match is None:
            # Without this check the failure surfaces as an
            # AttributeError on None, with no hint of which line is bad.
            raise AssertionError('Malformed lexicon line \'' + line + '\'')
        (ident, sep, catstr) = lex_match.groups()
        (cat, var) = augParseCategory(catstr, primitives, families)
        if sep == '::':
            # Family definition
            # ie, Det :: NP/N
            families[ident] = (cat, var)
        else:
            # Word definition
            # ie, which => (N\N)/(S/NP)
            entries[ident].append(cat)

    if not primitives:
        # The target category is the first primitive, so at least one
        # must have been declared.
        raise AssertionError('Lexicon declares no primitive categories')
    return CCGLexicon(primitives[0], primitives, families, entries)
# Example lexicon, built by parseLexicon from the inline definition
# below when this module is loaded.
openccg_tinytiny = parseLexicon('''
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural
the => NP[sg]/N[sg]
the => NP[pl]/N[pl]
I => Pro
me => Pro
we => Pro
us => Pro
book => N[sg]
books => N[pl]
peach => N[sg]
peaches => N[pl]
policeman => N[sg]
policemen => N[pl]
boy => N[sg]
boys => N[pl]
sleep => IntransVsg
sleep => IntransVpl
eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg
see => TransVpl
sees => TransVsg
''')