# Natural Language Toolkit: Parser Utility Functions
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
# Copyright (C) 2001-2013 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT


"""
Utility functions for parsers.
"""
from __future__ import print_function

from nltk.grammar import ContextFreeGrammar, FeatureGrammar, WeightedGrammar
from nltk.data import load

from nltk.parse.chart import Chart, ChartParser
from nltk.parse.pchart import InsideChartParser
from nltk.parse.featurechart import FeatureChart, FeatureChartParser

def load_parser(grammar_url, trace=0,
                parser=None, chart_class=None,
                beam_size=0, **load_args):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``ContextFreeGrammar``)
      - ``'pcfg'`` (probabilistic CFGs: ``WeightedGrammar``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.  If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart; should be
        ``Chart`` or a subclass.  Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, ContextFreeGrammar):
        raise ValueError("The grammar must be a ContextFreeGrammar, "
                         "or a subclass thereof.")

    if isinstance(grammar, WeightedGrammar):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    elif isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
        return parser(grammar, trace=trace, chart_class=chart_class)

    else:  # Plain ContextFreeGrammar.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
        return parser(grammar, trace=trace, chart_class=chart_class)
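
# A minimal usage sketch (not part of the original module).  The grammar
# path below is an assumption chosen for illustration; substitute any
# CFG, PCFG or FCFG file available in your NLTK data package, together
# with a sentence that the chosen grammar actually covers.
def _demo_load_parser():
    # ``load_parser`` picks the parser class from the grammar format;
    # for a plain CFG this yields a ChartParser over a regular Chart.
    cp = load_parser('grammars/sample_grammars/toy.cfg', trace=0)
    # Consume the parser output the same way TestGrammar.run does below.
    for tree in cp.parse('the dog saw a man in the park'.split()):
        print(tree)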


######################################################################
#{ Test Suites
######################################################################

class TestGrammar(object):
    """
    Unit tests for CFG.
    """
    def __init__(self, grammar, suite, accept=None, reject=None):
        self.test_grammar = grammar
        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject

    def run(self, show_trees=False):
        """
        Sentences in the test suite are divided into two classes:

        - grammatical (``accept``) and
        - ungrammatical (``reject``).

        If a sentence should parse according to the grammar, the value of
        ``trees`` will be a non-empty list. If a sentence should be rejected
        according to the grammar, then the value of ``trees`` will be None.
        """
        for test in self.suite:
            print(test['doc'] + ":", end=' ')
            # Track whether each class of sentence behaved as expected;
            # initialising here avoids an unbound name if a list is empty.
            accepted = rejected = False
            for key in ['accept', 'reject']:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = self.cp.parse(tokens)
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == 'accept':
                        if trees == []:
                            raise ValueError("Sentence '%s' failed to parse." % sent)
                        else:
                            accepted = True
                    else:
                        if trees:
                            raise ValueError("Sentence '%s' received a parse." % sent)
                        else:
                            rejected = True
            if accepted and rejected:
                print("All tests passed!")
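
# A minimal usage sketch (not part of the original module).  The grammar
# path and the sentences are assumptions; the suite entry only shows the
# expected shape: a dict with 'doc', 'accept' and 'reject' keys.
def _demo_test_grammar():
    suite = [
        {'doc': 'simple declaratives',
         'accept': ['the dog barks'],
         'reject': ['dog the barks']},
    ]
    # run() raises ValueError as soon as an 'accept' sentence fails to
    # parse or a 'reject' sentence receives a parse.
    TestGrammar('grammars/sample_grammars/toy.cfg', suite).run()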

def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:

    - a bool, saying if the sentence is grammatical or not, or
    - an int, giving the number of parse trees it should have.

    The result information is followed by a colon, and then the sentence.
    Empty lines and lines beginning with a comment char are ignored.

    :return: a list of tuples ``(sentence, result)``, where ``sentence``
        is a list of str, and ``result`` is None, or bool, or int

    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    """
    if encoding is not None:
        string = string.decode(encoding)
    sentences = []
    for sentence in string.split('\n'):
        if sentence == '' or sentence[0] in comment_chars:
            continue
        split_info = sentence.split(':', 1)
        result = None
        if len(split_info) == 2:
            if split_info[0] in ['True', 'true', 'False', 'false']:
                result = split_info[0] in ['True', 'true']
                sentence = split_info[1]
            else:
                result = int(split_info[0])
                sentence = split_info[1]
        tokens = sentence.split()
        if tokens == []:
            continue
        sentences += [(tokens, result)]
    return sentences

# nose thinks it is a test
extract_test_sentences.__test__ = False
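
# A minimal usage sketch (not part of the original module) showing the
# line format accepted by ``extract_test_sentences``: an optional bool or
# int result, a colon, then the sentence itself.
def _demo_extract_test_sentences():
    text = ("# comment lines and blank lines are skipped\n"
            "True: the dog barks\n"
            "2: the old man the boats\n"
            "the cat sleeps\n")
    for tokens, result in extract_test_sentences(text):
        print(result, tokens)
    # Prints:
    #   True ['the', 'dog', 'barks']
    #   2 ['the', 'old', 'man', 'the', 'boats']
    #   None ['the', 'cat', 'sleeps']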