Source code for nltk.corpus.reader.wordlist
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk import compat
from nltk.tokenize import line_tokenize
from .util import *
from .api import *
class WordListCorpusReader(CorpusReader):
    """
    List of words, one per line.  Blank lines are ignored.
    """

    def words(self, fileids=None):
        """
        Return the entries of the given file(s), split with
        ``line_tokenize``.

        :param fileids: a single file identifier, an iterable of file
            identifiers, or None to use every file of this corpus.
        :return: the result of ``line_tokenize`` applied to
            ``self.raw(fileids)``.
        """
        return line_tokenize(self.raw(fileids))

    def raw(self, fileids=None):
        """
        Return the contents of the given file(s), joined with ``concat``.

        :param fileids: a single file identifier, an iterable of file
            identifiers, or None to use every file of this corpus.
        :return: the concatenation of the contents of each file, in the
            order the file identifiers were given.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            # A lone identifier is wrapped so the loop below does not
            # iterate over its characters.
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            try:
                contents.append(stream.read())
            finally:
                # Release the handle right away instead of leaving it to
                # the garbage collector.
                # NOTE(review): assumes every stream returned by
                # self.open() provides close() -- confirm for custom
                # path pointers.
                stream.close()
        return concat(contents)
class SwadeshCorpusReader(WordListCorpusReader):
    """
    Word list reader that can line up the word lists of several files
    position by position (see ``entries``).
    """

    def entries(self, fileids=None):
        """
        Return the word lists of the given files aligned by position.

        :param fileids: a single file identifier, an iterable of file
            identifiers, or a false value (None, empty) to use every
            file returned by ``self.fileids()``.
        :return: a list of tuples; the i-th tuple holds the i-th word of
            each requested file, in the order the files were given.
            Because the lists are combined with ``zip``, the result is
            as long as the shortest word list.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, compat.string_types):
            # A lone identifier must be wrapped; iterating the string
            # itself would look up one "file" per character.
            fileids = [fileids]

        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))