
Source code for torchtext.experimental.datasets.language_modeling

import torch
import logging
import io
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.data.functional import numericalize_tokens_from_iterator

URLS = {
    'WikiText2':
        'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',
    'WikiText103':
        'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip',
    'PennTreebank':
        ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt',
         'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt',
         'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt']
}


class LanguageModelingDataset(torch.utils.data.Dataset):
    """Defines a dataset for language modeling.
       Currently, we only support the following datasets:

             - WikiText2
             - WikiText103
             - PennTreebank

    """
    def __init__(self, data, vocab):
        """Initialize the language modeling dataset.

        Arguments:
            data: a tensor of tokens. Tokens are ids obtained by numericalizing
                the string tokens, e.g.
                torch.tensor([token_id_1, token_id_2, token_id_3, token_id_1]).long()
            vocab: Vocabulary object used for dataset.

        Examples:
            >>> from torchtext.vocab import build_vocab_from_iterator
            >>> data = torch.tensor([token_id_1, token_id_2,
                                     token_id_3, token_id_1]).long()
            >>> vocab = build_vocab_from_iterator([['language', 'modeling']])
            >>> dataset = LanguageModelingDataset(data, vocab)
        """

        super(LanguageModelingDataset, self).__init__()
        self.data = data
        self.vocab = vocab
    def __getitem__(self, i):
        return self.data[i]

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for x in self.data:
            yield x

    def get_vocab(self):
        return self.vocab
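
A minimal usage sketch for the class above (illustrative, not part of the module source). The toy corpus and variable names are made up; looking tokens up with vocab[token] is assumed to work here, as the module's own numericalization path relies on it:

    >>> from torchtext.vocab import build_vocab_from_iterator
    >>> tokens = ['language', 'modeling', 'with', 'torchtext']
    >>> vocab = build_vocab_from_iterator([tokens])
    >>> data = torch.tensor([vocab[t] for t in tokens]).long()
    >>> dataset = LanguageModelingDataset(data, vocab)
    >>> len(dataset)                  # number of token ids
    4
    >>> dataset.get_vocab() is vocab
    True
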
def _get_datafile_path(key, extracted_files):
    for fname in extracted_files:
        if key in fname:
            return fname


def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"),
                    root='.data', vocab=None, removed_tokens=[],
                    data_select=('train', 'test', 'valid')):
    if isinstance(data_select, str):
        data_select = [data_select]
    if not set(data_select).issubset(set(('train', 'test', 'valid'))):
        raise TypeError('data_select is not supported!')

    if dataset_name == 'PennTreebank':
        extracted_files = []
        select_to_index = {'train': 0, 'test': 1, 'valid': 2}
        extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]],
                                             root=root) for key in data_select]
    else:
        dataset_tar = download_from_url(URLS[dataset_name], root=root)
        extracted_files = extract_archive(dataset_tar)

    _path = {}
    for item in data_select:
        _path[item] = _get_datafile_path(item, extracted_files)

    if vocab is None:
        if 'train' not in _path.keys():
            raise TypeError("Must pass a vocab if train is not selected.")
        logging.info('Building Vocab based on {}'.format(_path['train']))
        txt_iter = iter(tokenizer(row) for row in io.open(_path['train'],
                                                          encoding="utf8"))
        vocab = build_vocab_from_iterator(txt_iter)
        logging.info('Vocab has {} entries'.format(len(vocab)))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")

    data = {}
    for item in _path.keys():
        data[item] = []
        logging.info('Creating {} data'.format(item))
        txt_iter = iter(tokenizer(row) for row in io.open(_path[item],
                                                          encoding="utf8"))
        _iter = numericalize_tokens_from_iterator(
            vocab, txt_iter, removed_tokens)
        for tokens in _iter:
            data[item] += [token_id for token_id in tokens]

    for key in data_select:
        if data[key] == []:
            raise TypeError('Dataset {} is empty!'.format(key))

    return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab)
                 for d in data_select)
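
The core of _setup_datasets is a tokenize, build-vocab, numericalize pipeline. The sketch below runs the same steps on an in-memory list of lines instead of a downloaded file; the two-line corpus and variable names are illustrative only, not taken from the original source:

    >>> tokenizer = get_tokenizer("basic_english")
    >>> lines = ["language modeling is fun", "language modeling with torchtext"]
    >>> vocab = build_vocab_from_iterator(tokenizer(line) for line in lines)
    >>> ids_iter = numericalize_tokens_from_iterator(
    ...     vocab, (tokenizer(line) for line in lines), removed_tokens=[])
    >>> data = []
    >>> for ids in ids_iter:
    ...     data += [token_id for token_id in ids]
    >>> dataset = LanguageModelingDataset(torch.tensor(data).long(), vocab)
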
def WikiText2(*args, **kwargs):
    """ Defines WikiText2 datasets.

    Create language modeling dataset: WikiText2
    Separately returns the train/test/valid set

    Arguments:
        tokenizer: the tokenizer used to preprocess raw text data.
            The default one is basic_english tokenizer in fastText. spacy tokenizer
            is supported as well (see example below). A custom tokenizer is a
            callable function with input of a string and output of a token list.
        root: Directory where the datasets are saved. Default: ".data"
        vocab: Vocabulary used for dataset. If None, it will generate a new
            vocabulary based on the train data set.
        removed_tokens: removed tokens from output dataset (Default: [])
        data_select: a string or tuple for the returned datasets
            (Default: ('train', 'test', 'valid'))
            By default, all the three datasets (train, test, valid) are generated.
            Users could also choose any one or two of them, for example
            ('train', 'test') or just a string 'train'. If 'train' is not in the
            tuple or string, a vocab object should be provided which will be used
            to process valid and/or test data.

    Examples:
        >>> from torchtext.experimental.datasets import WikiText2
        >>> from torchtext.data.utils import get_tokenizer
        >>> tokenizer = get_tokenizer("spacy")
        >>> train_dataset, test_dataset, valid_dataset = WikiText2(tokenizer=tokenizer)
        >>> vocab = train_dataset.get_vocab()
        >>> valid_dataset, = WikiText2(tokenizer=tokenizer, vocab=vocab,
                                       data_select='valid')

    """

    return _setup_datasets(*(("WikiText2",) + args), **kwargs)

def WikiText103(*args, **kwargs):
    """ Defines WikiText103 datasets.

    Create language modeling dataset: WikiText103
    Separately returns the train/test/valid set

    Arguments:
        tokenizer: the tokenizer used to preprocess raw text data.
            The default one is basic_english tokenizer in fastText. spacy tokenizer
            is supported as well (see example below). A custom tokenizer is a
            callable function with input of a string and output of a token list.
        root: Directory where the datasets are saved. Default: ".data"
        vocab: Vocabulary used for dataset. If None, it will generate a new
            vocabulary based on the train data set.
        removed_tokens: removed tokens from output dataset (Default: [])
        data_select: a string or tuple for the returned datasets
            (Default: ('train', 'test', 'valid'))
            By default, all the three datasets (train, test, valid) are generated.
            Users could also choose any one or two of them, for example
            ('train', 'test') or just a string 'train'. If 'train' is not in the
            tuple or string, a vocab object should be provided which will be used
            to process valid and/or test data.

    Examples:
        >>> from torchtext.experimental.datasets import WikiText103
        >>> from torchtext.data.utils import get_tokenizer
        >>> tokenizer = get_tokenizer("spacy")
        >>> train_dataset, test_dataset, valid_dataset = WikiText103(tokenizer=tokenizer)
        >>> vocab = train_dataset.get_vocab()
        >>> valid_dataset, = WikiText103(tokenizer=tokenizer, vocab=vocab,
                                         data_select='valid')

    """

    return _setup_datasets(*(("WikiText103",) + args), **kwargs)

def PennTreebank(*args, **kwargs):
    """ Defines PennTreebank datasets.

    Create language modeling dataset: PennTreebank
    Separately returns the train/test/valid set

    Arguments:
        tokenizer: the tokenizer used to preprocess raw text data.
            The default one is basic_english tokenizer in fastText. spacy tokenizer
            is supported as well (see example below). A custom tokenizer is a
            callable function with input of a string and output of a token list.
        root: Directory where the datasets are saved. Default: ".data"
        vocab: Vocabulary used for dataset. If None, it will generate a new
            vocabulary based on the train data set.
        removed_tokens: removed tokens from output dataset (Default: [])
        data_select: a string or tuple for the returned datasets
            (Default: ('train', 'test', 'valid'))
            By default, all the three datasets (train, test, valid) are generated.
            Users could also choose any one or two of them, for example
            ('train', 'test') or just a string 'train'. If 'train' is not in the
            tuple or string, a vocab object should be provided which will be used
            to process valid and/or test data.

    Examples:
        >>> from torchtext.experimental.datasets import PennTreebank
        >>> from torchtext.data.utils import get_tokenizer
        >>> tokenizer = get_tokenizer("spacy")
        >>> train_dataset, test_dataset, valid_dataset = PennTreebank(tokenizer=tokenizer)
        >>> vocab = train_dataset.get_vocab()
        >>> valid_dataset, = PennTreebank(tokenizer=tokenizer, vocab=vocab,
                                          data_select='valid')

    """

    return _setup_datasets(*(("PennTreebank",) + args), **kwargs)
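
Each dataset returned above wraps a flat 1-D tensor of token ids (exposed as the data attribute set in __init__), so batching for language-model training is typically done by reshaping the tensor into (sequence_length, batch_size) columns and shifting by one token for the targets. The sketch below follows the common batchify recipe from the PyTorch word-language-model example; it is not an API of this module, and the batch_size and bptt values are arbitrary:

    >>> train_dataset, = WikiText2(data_select='train')
    >>> data = train_dataset.data                     # 1-D tensor of token ids
    >>> batch_size, bptt = 20, 35
    >>> nbatch = data.size(0) // batch_size
    >>> data = data[:nbatch * batch_size].view(batch_size, -1).t().contiguous()
    >>> source, target = data[:bptt], data[1:bptt + 1]   # targets shifted by one token
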
