Source code for nltk.model.ngram

# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
#          Daniel Blanchard <dblanchard@ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals

from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              SimpleGoodTuringProbDist)
from nltk.util import ngrams
from nltk.model.api import ModelI

from nltk import compat


def _estimator(fdist, bins):
    """
    Default estimator function using a SimpleGoodTuringProbDist.
    """
    # This can't be an instance method of NgramModel (or a lambda), since
    # neither can be pickled.
    return SimpleGoodTuringProbDist(fdist)
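
# Illustrative alternative (a sketch only, not used anywhere in this module):
# any function with the signature ``f(fdist, bins)`` that returns a probability
# distribution can be passed to NgramModel as ``estimator``, e.g. Lidstone
# smoothing with an arbitrarily chosen gamma of 0.2:
#
#     def _lidstone_estimator(fdist, bins):
#         from nltk.probability import LidstoneProbDist
#         return LidstoneProbDist(fdist, 0.2, bins)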


@compat.python_2_unicode_compatible
class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    # add cutoff
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        0.5776...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an
            (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an
            (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist is passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        for sent in train:
            for ngram in ngrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator,
                                              *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                                       estimator, *estimator_args, **estimator_kwargs)
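
    # Illustrative sketch (not one of the module's doctests): extra positional
    # estimator arguments are forwarded to the estimator for every condition,
    # so a Lidstone model with gamma 0.2 can also be built without a lambda.
    # The corpus and gamma value below are arbitrary choices for illustration.
    #
    #     >>> from nltk.corpus import brown
    #     >>> from nltk.probability import LidstoneProbDist
    #     >>> words = brown.words(categories='news')
    #     >>> lm = NgramModel(2, words, True, False, LidstoneProbDist, 0.2)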

    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context using Katz Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        context = tuple(context)
        if (context + (word,) in self._ngrams) or (self._n == 1):
            return self[context].prob(word)
        else:
            return self._alpha(context) * self._backoff.prob(word, context[1:])
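
    # Reading of the backoff step above (a sketch that follows the code, not a
    # particular textbook's notation): for an n-gram model,
    #
    #     P(w | w_1..w_{n-1}) = P_est(w | w_1..w_{n-1})                    if (w_1..w_{n-1}, w) was seen
    #                         = alpha(w_1..w_{n-1}) * P(w | w_2..w_{n-1})  otherwise
    #
    # where P_est is the smoothed distribution for the context and alpha
    # (computed by _alpha/_beta below) redistributes the probability mass that
    # the estimator's discounting held back for unseen words.  The unigram
    # model (n == 1) always answers directly.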

    def _alpha(self, tokens):
        return self._beta(tokens) / self._backoff._beta(tokens[1:])

    def _beta(self, tokens):
        return (self[tokens].discount() if tokens in self else 1)

    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        return -log(self.prob(word, context), 2)
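
    # Note: this is the *negative* base-2 log, so less probable words score
    # higher; entropy() below averages exactly this quantity over a text.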

    def choose_random_word(self, context):
        '''
        Randomly select a word that is likely to appear in this context.

        :param context: the context the word is in
        :type context: list(str)
        '''
        return self.generate(1, context)[-1]

    # NB, this will always start with same word if the model
    # was trained on a single text

    def generate(self, num_words, context=()):
        '''
        Generate random text based on the language model.

        :param num_words: number of words to generate
        :type num_words: int
        :param context: initial words in generated string
        :type context: list(str)
        '''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text
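
    # Illustrative usage (assuming ``lm`` is an NgramModel trained as in the
    # class doctest; output depends on the training text and on the random
    # draws made by the per-context distributions, hence +SKIP):
    #
    #     >>> lm.generate(10, context=['The', 'Fulton'])   # doctest: +SKIP
    #     ['The', 'Fulton', ...]
    #
    # The returned list starts with the supplied context words, followed by
    # ``num_words`` generated tokens.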

    def _generate_one(self, context):
        context = (self._lpad + tuple(context))[-self._n + 1:]
        # print "Context (%d): <%s>" % (self._n, ','.join(context))
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given evaluation text.  This is the average log probability of each
        word in the text.

        :param text: words to use for evaluation
        :type text: list(str)
        """
        e = 0.0
        text = list(self._lpad) + text + list(self._rpad)
        for i in range(self._n - 1, len(text)):
            context = tuple(text[i - self._n + 1:i])
            token = text[i]
            e += self.logprob(token, context)
        return e / float(len(text) - (self._n - 1))
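
    # In formula form: for the padded text w_1 .. w_N and model order n, the
    # value returned above is
    #
    #     H(text) = -(1 / (N - (n - 1))) * sum_{i = n}^{N} log2 P(w_i | w_{i-n+1} .. w_{i-1})
    #
    # i.e. the average negative log2 probability per scored token.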

    def perplexity(self, text):
        """
        Calculates the perplexity of the given text.  This is simply
        2 ** cross-entropy for the text.

        :param text: words to calculate perplexity of
        :type text: list(str)
        """
        return pow(2.0, self.entropy(text))
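
    # For example, a cross-entropy of 8 bits per word corresponds to a
    # perplexity of 2 ** 8 = 256: on average the model is as uncertain as a
    # uniform choice among 256 equally likely words.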

    def __contains__(self, item):
        return tuple(item) in self._model

    def __getitem__(self, item):
        return self._model[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)


def teardown_module(module=None):
    from nltk.corpus import brown
    brown._unload()

if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)