Source code for nltk.corpus.reader.udhr

# -*- coding: utf-8 -*-
"""
UDHR corpus reader. It mostly deals with encodings.
"""
from __future__ import absolute_import, unicode_literals

from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext corpus reader for the UDHR corpus.

    Each corpus file is stored in its own character encoding, so this
    reader's job is to supply a per-file encoding table (``ENCODINGS``)
    and to leave out the files that cannot be read (``SKIP``).
    """

    # (fileid regex, codec name) pairs, passed unchanged as the
    # ``encoding`` argument of the base class.
    # NOTE(review): the base class presumably uses the first pattern that
    # matches a fileid, which is why the Czech special case is listed
    # before the generic ``-UTF8`` rule -- confirm before reordering.
    ENCODINGS = [
        (r'.*-Latin1$', 'latin-1'),
        (r'.*-Hebrew$', 'hebrew'),
        (r'.*-Arabic$', 'cp1256'),
        # Named "-UTF8", yet this file is mapped to cp1250.
        (r'Czech_Cesky-UTF8', 'cp1250'),
        (r'.*-Cyrillic$', 'cyrillic'),
        (r'.*-SJIS$', 'SJIS'),
        (r'.*-GB2312$', 'GB2312'),
        (r'.*-Latin2$', 'ISO-8859-2'),
        (r'.*-Greek$', 'greek'),
        (r'.*-UTF8$', 'utf-8'),

        (r'Hungarian_Magyar-Unicode', 'utf-16-le'),
        (r'Amahuaca', 'latin1'),
        (r'Turkish_Turkce-Turkish', 'latin5'),
        (r'Lithuanian_Lietuviskai-Baltic', 'latin4'),
        (r'Japanese_Nihongo-EUC', 'EUC-JP'),
        (r'Japanese_Nihongo-JIS', 'iso2022_jp'),
        (r'Chinese_Mandarin-HZ', 'hz'),
        # Raw string: ``\-`` and ``\+`` are regex escapes, not string
        # escapes, and are invalid escape sequences in a plain literal.
        (r'Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
    ]

    # Fileids that ``__init__`` removes from the corpus file list.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        'Burmese_Myanmar-UTF8',
        'Japanese_Nihongo-JIS',
        'Chinese_Mandarin-HZ',
        'Chinese_Mandarin-UTF8',
        'Gujarati-UTF8',
        'Hungarian_Magyar-Unicode',
        'Lao-UTF8',
        'Magahi-UTF8',
        'Marathi-UTF8',
        'Tamil-UTF8',

        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        'Vietnamese-VPS',
        'Vietnamese-VIQR',
        'Vietnamese-TCVN',
        'Magahi-Agra',
        'Bhojpuri-Agra',
        'Esperanto-T61',  # latin3 raises an exception

        # The following files are encoded for specific fonts:
        'Burmese_Myanmar-WinResearcher',
        'Armenian-DallakHelv',
        'Tigrinya_Tigrigna-VG2Main',
        'Amharic-Afenegus6..60375',  # ?
        'Navaho_Dine-Navajo-Navaho-font',

        # What are these?
        'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
        'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',

        # The following files are unintended:
        'Czech-Latin2-err',
        'Russian_Russky-UTF8~',
    }

    def __init__(self, root='udhr'):
        """
        Build the reader over every usable file found under ``root``.

        :param root: location of the corpus, handed both to
            ``find_corpus_fileids`` and to the base-class constructor.
        """
        # The negative lookahead is meant to leave out the README and
        # dot-files; every other name is accepted by ``.*``.
        fileids = find_corpus_fileids(root, r'(?!README|\.).*')
        super(UdhrCorpusReader, self).__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS
        )
Source code for nltk.corpus.reader.udhr

Table Of Contents

Search