Source code for nltk.corpus.reader.bracket_parse

# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""

import sys

from nltk.tree import Tree

from .util import *
from .api import *


# Tokens are matched with [^\s()]+ rather than \S+? so that a tag or a word
# can never swallow a parenthesis.

# A leaf of the form "(tag word)"; group 1 is the tag, group 2 is the word.
TAGWORD = re.compile(
    r'\(([^\s()]+)'     # "(" followed by the captured tag
    r' '                # the single space separating tag and word
    r'([^\s()]+)\)'     # the captured word followed by ")"
)

# The same leaf shape, but only the word is captured.
WORD = re.compile(
    r'\([^\s()]+'       # "(" followed by the (uncaptured) tag
    r' '                # the single space separating tag and word
    r'([^\s()]+)\)'     # the captured word followed by ")"
)

# Two opening parentheses at the start of the text, separated (and optionally
# preceded) only by whitespace.
EMPTY_BRACKETS = re.compile(
    r'\s*\('            # optional leading whitespace, then the outer "("
    r'\s*\('            # optional whitespace, then the inner "("
)

class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees.
    """

    def __init__(self, root, fileids, comment_char=None,
                 detect_blocks='unindented_paren', encoding='utf8',
                 tagset=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this
            corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched) or 'blankline' (blocks are read with
            ``read_blankline_block``).
        :param encoding: The encoding handed to ``CorpusReader.__init__``.
        :param tagset: The name of the tagset used by this corpus, to be
            used for normalizing or converting the POS tags returned by
            the tagged_...() methods.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """
        Read the next block of raw tree strings from ``stream``, using the
        strategy selected by ``detect_blocks``.

        :param stream: The corpus stream to read from.
        :return: Whatever the selected ``read_*_block`` helper returns; in
            'unindented_paren' mode, the tokens with comment lines removed.
        :raise AssertionError: If ``detect_blocks`` is not one of the
            supported modes.
        """
        if self._detect_blocks == 'sexpr':
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == 'blankline':
            return read_blankline_block(stream)
        elif self._detect_blocks == 'unindented_paren':
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r'^\(')
            # Strip any comments out of the tokens.
            if self._comment_char:
                # Compile once; the pattern is the same for every token.
                comment_re = re.compile(
                    '(?m)^%s.*' % re.escape(self._comment_char))
                toks = [comment_re.sub('', tok) for tok in toks]
            return toks
        else:
            # Raised explicitly (rather than via ``assert``) so that the
            # check is not stripped when Python runs with -O.
            raise AssertionError('bad block type')

    def _normalize(self, t):
        """
        Rewrite the raw tree string ``t`` into the plain "(tag word)" leaf
        format expected by the parsing and tagging helpers.

        :param t: The raw text of one tree.
        :return: The normalized tree string.
        """
        # If there's an empty set of brackets surrounding the actual
        # parse, then strip them off.
        if EMPTY_BRACKETS.match(t):
            t = t.strip()[1:-1]
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """
        Parse the tree string ``t`` into a ``Tree``, attempting recovery
        when the string is malformed.

        Recovery is reported on ``sys.stderr``.  For a 'mismatched parens'
        error, one to four closing parentheses are appended in turn; if
        that does not help (or the error is of another kind), a flat tree
        labelled 'S' is built from the (word, tag) pairs found in ``t``.

        :param t: The raw text of one tree.
        :return: The parsed (or recovered) ``Tree``.
        """
        try:
            return Tree.parse(self._normalize(t))
        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        v = Tree.parse(self._normalize(t + ')' * n))
                    except ValueError:
                        # Still unbalanced; try one more paren.
                        continue
                    sys.stderr.write(" Recovered by adding %d close "
                                     "paren(s)\n" % n)
                    return v
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            return Tree('S', self._tag(t))

    def _tag(self, t, tagset=None):
        """
        Extract the (word, tag) pairs from the tree string ``t``.

        :param t: The raw text of one tree.
        :param tagset: If given and different from the corpus' own tagset,
            every tag is converted with ``map_tag``.
        :return: A list of (word, tag) tuples.
        """
        # TAGWORD captures (tag, word); flip each pair to (word, tag).
        tagged_sent = [(word, tag) for (tag, word)
                       in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            # NOTE(review): map_tag has no explicit import in this module;
            # presumably it arrives through one of the star imports --
            # confirm, otherwise this branch raises NameError.
            tagged_sent = [(word, map_tag(self._tagset, tagset, tag))
                           for (word, tag) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """
        Extract the words (leaf tokens) from the tree string ``t``.

        :param t: The raw text of one tree.
        :return: A list of word strings.
        """
        return WORD.findall(self._normalize(t))
class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
                                          BracketParseCorpusReader):
    """
    A reader for parsed corpora whose documents are divided into
    categories based on their file identifiers.

    Author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``BracketParseCorpusReader``
        constructor.
        """
        # The kwargs dict itself (not **kwargs) is handed over here;
        # presumably CategorizedCorpusReader consumes the categorization
        # keys from it before it is expanded below -- confirm.
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) selection into a fileids selection.

        :param fileids: An explicit fileids selection, or None.
        :param categories: A categories selection, or None.
        :return: ``fileids`` unchanged when no categories are given,
            otherwise ``self.fileids(categories)``.
        :raise ValueError: If both arguments are given.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    # Each accessor below binds the BracketParseCorpusReader method first
    # and then hands it the file selection produced by _resolve().

    def raw(self, fileids=None, categories=None):
        """Delegate to ``BracketParseCorpusReader.raw`` for the selection."""
        delegate = BracketParseCorpusReader.raw
        return delegate(self, self._resolve(fileids, categories))

    def words(self, fileids=None, categories=None):
        """Delegate to ``BracketParseCorpusReader.words`` for the selection."""
        delegate = BracketParseCorpusReader.words
        return delegate(self, self._resolve(fileids, categories))

    def sents(self, fileids=None, categories=None):
        """Delegate to ``BracketParseCorpusReader.sents`` for the selection."""
        delegate = BracketParseCorpusReader.sents
        return delegate(self, self._resolve(fileids, categories))

    def paras(self, fileids=None, categories=None):
        """Delegate to ``BracketParseCorpusReader.paras`` for the selection."""
        delegate = BracketParseCorpusReader.paras
        return delegate(self, self._resolve(fileids, categories))

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        """
        Delegate to ``BracketParseCorpusReader.tagged_words`` for the
        selection, forwarding ``tagset``.
        """
        delegate = BracketParseCorpusReader.tagged_words
        return delegate(self, self._resolve(fileids, categories), tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        """
        Delegate to ``BracketParseCorpusReader.tagged_sents`` for the
        selection, forwarding ``tagset``.
        """
        delegate = BracketParseCorpusReader.tagged_sents
        return delegate(self, self._resolve(fileids, categories), tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        """
        Delegate to ``BracketParseCorpusReader.tagged_paras`` for the
        selection, forwarding ``tagset``.
        """
        delegate = BracketParseCorpusReader.tagged_paras
        return delegate(self, self._resolve(fileids, categories), tagset)

    def parsed_words(self, fileids=None, categories=None):
        """
        Delegate to ``BracketParseCorpusReader.parsed_words`` for the
        selection.
        """
        delegate = BracketParseCorpusReader.parsed_words
        return delegate(self, self._resolve(fileids, categories))

    def parsed_sents(self, fileids=None, categories=None):
        """
        Delegate to ``BracketParseCorpusReader.parsed_sents`` for the
        selection.
        """
        delegate = BracketParseCorpusReader.parsed_sents
        return delegate(self, self._resolve(fileids, categories))

    def parsed_paras(self, fileids=None, categories=None):
        """
        Delegate to ``BracketParseCorpusReader.parsed_paras`` for the
        selection.
        """
        delegate = BracketParseCorpusReader.parsed_paras
        return delegate(self, self._resolve(fileids, categories))
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    """

    def __init__(self, root, encoding='ISO-8859-1', tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: The encoding passed on to
            ``BracketParseCorpusReader.__init__``.
        :param tagset: The tagset name passed on to
            ``BracketParseCorpusReader.__init__``.
        """
        # The fileids pattern is a raw string: '\.' is not a valid escape
        # in an ordinary string literal and triggers a warning on modern
        # Python.  The runtime value is unchanged.
        BracketParseCorpusReader.__init__(self, root, r'alpino\.xml',
                                          detect_blocks='blankline',
                                          encoding=encoding,
                                          tagset=tagset)

    def _normalize(self, t):
        """
        Convert one block of Alpino XML into bracketed (s-expression)
        notation.

        :param t: The raw text of one block.
        :return: The bracketed string, or "" when the block does not start
            with an ``<alpino_ds`` element.
        """
        if not t.startswith("<alpino_ds"):
            return ""
        # convert XML to sexpr notation
        # No DOTALL flag is used, so '.' stops at line ends and every
        # substitution below stays within a single line of the block.
        # Opening <node> elements carrying a category become "(cat".
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        # Self-closing <node> elements with pos and word become "(pos word)".
        t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>',
                   r"(\1 \2)", t)
        # Closing tags become closing parentheses.
        t = re.sub(r" </node>", r")", t)
        # Drop the <sentence> element and the <alpino_ds> wrapper tags.
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t