Source code for nltk.corpus.reader.udhr

# -*- coding: utf-8 -*-
"""
UDHR corpus reader. It mostly deals with encodings.
"""
from __future__ import absolute_import, unicode_literals

from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext reader for the UDHR corpus.

    The corpus files use many different encodings.  ``ENCODINGS`` pairs
    file id patterns with codec names and is handed to
    ``PlaintextCorpusReader`` as its ``encoding`` argument; file ids
    listed in ``SKIP`` are left out of the reader entirely.
    """

    # (file id regexp, codec name) pairs.
    # NOTE(review): order presumably matters (first matching pattern
    # wins), which is why the specific 'Czech_Cesky-UTF8' entry is listed
    # before the generic '.*-UTF8$' one -- confirm against the base reader.
    ENCODINGS = [
        ('.*-Latin1$', 'latin-1'),
        ('.*-Hebrew$', 'hebrew'),
        ('.*-Arabic$', 'cp1256'),
        ('Czech_Cesky-UTF8', 'cp1250'),  # yeah: cp1250 despite the name
        ('.*-Cyrillic$', 'cyrillic'),
        ('.*-SJIS$', 'SJIS'),
        ('.*-GB2312$', 'GB2312'),
        ('.*-Latin2$', 'ISO-8859-2'),
        ('.*-Greek$', 'greek'),
        ('.*-UTF8$', 'utf-8'),

        ('Hungarian_Magyar-Unicode', 'utf-16-le'),
        ('Amahuaca', 'latin1'),
        ('Turkish_Turkce-Turkish', 'latin5'),
        ('Lithuanian_Lietuviskai-Baltic', 'latin4'),
        ('Japanese_Nihongo-EUC', 'EUC-JP'),
        ('Japanese_Nihongo-JIS', 'iso2022_jp'),
        ('Chinese_Mandarin-HZ', 'hz'),
        # Raw string: '\-' and '\+' are regexp escapes, not string escapes.
        # The value is identical to the former non-raw literal, but no
        # invalid-escape-sequence warning is emitted at compile time.
        (r'Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
    ]

    # File ids that are excluded from the reader (compared literally,
    # not as regexps).
    SKIP = set([
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        'Burmese_Myanmar-UTF8',
        'Japanese_Nihongo-JIS',
        'Chinese_Mandarin-HZ',
        'Chinese_Mandarin-UTF8',
        'Gujarati-UTF8',
        'Hungarian_Magyar-Unicode',
        'Lao-UTF8',
        'Magahi-UTF8',
        'Marathi-UTF8',
        'Tamil-UTF8',

        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        'Vietnamese-VPS',
        'Vietnamese-VIQR',
        'Vietnamese-TCVN',
        'Magahi-Agra',
        'Bhojpuri-Agra',
        'Esperanto-T61',  # latin3 raises an exception

        # The following files are encoded for specific fonts:
        'Burmese_Myanmar-WinResearcher',
        'Armenian-DallakHelv',
        'Tigrinya_Tigrigna-VG2Main',
        'Amharic-Afenegus6..60375',  # ?
        'Navaho_Dine-Navajo-Navaho-font',

        # What are these?
        'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
        'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',

        # The following files are unintended:
        'Czech-Latin2-err',
        'Russian_Russky-UTF8~',
    ])

    def __init__(self, root='udhr'):
        """
        Build a reader over every usable file found under ``root``.

        :param root: corpus root; forwarded both to
            ``find_corpus_fileids`` and to the ``PlaintextCorpusReader``
            constructor.
        """
        # The negative lookahead rejects ids that begin with "README" or
        # with a dot (assuming find_corpus_fileids anchors the pattern at
        # the start of the file id -- TODO confirm).
        fileids = find_corpus_fileids(root, r'(?!README|\.).*')
        super(UdhrCorpusReader, self).__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS
        )