Source code for nltk.test.unit.test_corpora

# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
from nltk.corpus import (sinica_treebank, conll2007, indian, cess_cat, cess_esp,
                         floresta, ptb, udhr)
from nltk.tree import Tree
from .utils import skipIf


[docs]class TestUdhr(unittest.TestCase):
[docs] def test_words(self): for name in udhr.fileids(): try: words = list(udhr.words(name)) except AssertionError: print(name) raise self.assertTrue(words)
[docs] def test_raw_unicode(self): for name in udhr.fileids(): txt = udhr.raw(name) assert not isinstance(txt, bytes), name
[docs]class TestIndian(unittest.TestCase):
[docs] def test_words(self): words = indian.words()[:3] self.assertEqual(words, ['মহিষের', 'সন্তান', ':'])
[docs] def test_tagged_words(self): tagged_words = indian.tagged_words()[:3] self.assertEqual(tagged_words, [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')])
[docs]class TestCess(unittest.TestCase):
[docs] def test_catalan(self): words = cess_cat.words()[:15] txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial" self.assertEqual(words, txt.split())
[docs] def test_esp(self): words = cess_esp.words()[:15] txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del" self.assertEqual(words, txt.split())
[docs]class TestFloresta(unittest.TestCase):
[docs] def test_words(self): words = floresta.words()[:10] txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a" self.assertEqual(words, txt.split())
[docs]class TestSinicaTreebank(unittest.TestCase):
[docs] def test_sents(self): first_3_sents = sinica_treebank.sents()[:3] self.assertEqual( first_3_sents, [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']] )
[docs] def test_parsed_sents(self): parsed_sents = sinica_treebank.parsed_sents()[25] self.assertEqual(parsed_sents, Tree('S', [ Tree('NP', [ Tree('Nba', ['嘉珍']) ]), Tree('V‧地', [ Tree('VA11', ['不停']), Tree('DE', ['的']) ]), Tree('VA4', ['哭泣']) ]))
[docs]class TestCoNLL2007(unittest.TestCase): # Reading the CoNLL 2007 Dependency Treebanks
[docs] def test_sents(self): sents = conll2007.sents('esp.train')[0] self.assertEqual( sents[:6], ['El', 'aumento', 'del', 'índice', 'de', 'desempleo'] )
[docs] def test_parsed_sents(self): parsed_sents = conll2007.parsed_sents('esp.train')[0] self.assertEqual(parsed_sents.tree(), Tree('fortaleció', [ Tree('aumento', [ 'El', Tree('del', [ Tree('índice', [ Tree('de', [ Tree('desempleo', ['estadounidense']) ]) ]) ]) ]), 'hoy', 'considerablemente', Tree('al', [ Tree('euro', [ Tree('cotizaba', [ ',', 'que', Tree('a', [ Tree('15.35', ['las', 'GMT']) ]), 'se', Tree('en', [ Tree('mercado', [ 'el', Tree('de', ['divisas']), Tree('de', ['Fráncfort']) ]) ]), Tree('a', ['0,9452_dólares']), Tree('frente_a', [ ',', Tree('0,9349_dólares', [ 'los', Tree('de', [ Tree('mañana', ['esta']) ]) ]) ]) ]) ]) ]), '.' ]) )
@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
[docs]class TestPTB(unittest.TestCase):
[docs] def test_fileids(self): self.assertEqual( ptb.fileids()[:4], ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG'] )
[docs] def test_words(self): self.assertEqual( ptb.words('WSJ/00/WSJ_0003.MRG')[:7], ['A', 'form', 'of', 'asbestos', 'once', 'used', '*'] )
[docs] def test_tagged_words(self): self.assertEqual( ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3], [('A', 'DT'), ('form', 'NN'), ('of', 'IN')] )
[docs] def test_categories(self): self.assertEqual( ptb.categories(), ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction'] )
[docs] def test_news_fileids(self): self.assertEqual( ptb.fileids('news')[:3], ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG'] )
[docs] def test_category_words(self): self.assertEqual( ptb.words(categories=['humor','fiction'])[:6], ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back'] ) # unload corpora
from nltk.corpus import teardown_module