# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
# Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import codecs
from nltk.parse import DependencyGraph
from nltk.tokenize import *
from .util import *
from .api import *
class DependencyCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for dependency-parsed corpora.  Handles two on-disk
    layouts: simple tab-separated word/tag files (3 or 4 columns) and
    10-column CoNLL-style dependency files.  All actual parsing is done
    by DependencyCorpusView.
    """

    def __init__(self, root, fileids, encoding='utf8',
                 word_tokenizer=TabTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block):
        """
        :param root: corpus root directory.
        :param fileids: list of file identifiers (or a regexp pattern).
        :param encoding: file encoding, defaults to utf8.
        """
        # NOTE(review): the tokenizer / block-reader arguments are accepted
        # for signature compatibility but are never stored or used -- reading
        # is delegated entirely to DependencyCorpusView below.
        CorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        result = []
        for fileid, encoding in self.abspaths(fileids, include_encoding=True):
            if isinstance(fileid, PathPointer):
                result.append(fileid.open(encoding=encoding).read())
            else:
                # Plain filesystem path: open with the declared encoding and
                # close deterministically.
                with codecs.open(fileid, "r", encoding) as fp:
                    result.append(fp.read())
        return concat(result)

    def words(self, fileids=None):
        """:return: the given file(s) as a flat list of word strings."""
        return concat([DependencyCorpusView(fileid, False, False, False, encoding=enc)
                       for fileid, enc in self.abspaths(fileids, include_encoding=True)])

    def tagged_words(self, fileids=None):
        """:return: the given file(s) as a flat list of (word, tag) tuples."""
        return concat([DependencyCorpusView(fileid, True, False, False, encoding=enc)
                       for fileid, enc in self.abspaths(fileids, include_encoding=True)])

    def sents(self, fileids=None):
        """:return: the given file(s) as a list of sentences (lists of words)."""
        return concat([DependencyCorpusView(fileid, False, True, False, encoding=enc)
                       for fileid, enc in self.abspaths(fileids, include_encoding=True)])

    def tagged_sents(self, fileids=None):
        """:return: the given file(s) as a list of sentences of (word, tag) tuples."""
        return concat([DependencyCorpusView(fileid, True, True, False, encoding=enc)
                       for fileid, enc in self.abspaths(fileids, include_encoding=True)])

    def parsed_sents(self, fileids=None):
        """:return: the given file(s) as a list of DependencyGraph objects."""
        # dependencies=True makes the view yield each sentence as the raw
        # multi-line string that DependencyGraph's constructor expects.
        sents = concat([DependencyCorpusView(fileid, False, True, True, encoding=enc)
                        for fileid, enc in self.abspaths(fileids, include_encoding=True)])
        return [DependencyGraph(sent) for sent in sents]
class DependencyCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view of a dependency corpus file.  Each call to
    ``read_block`` consumes one blank-line-delimited sentence and returns
    it as words, (word, tag) tuples, or a raw dependency string, depending
    on the flags given at construction time.
    """

    # Marker that may precede the first sentence and is not corpus data.
    # (Original comment, Basque: "dokumentu hasiera definitzen da" --
    # "defines the document start".)
    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'

    def __init__(self, corpus_file, tagged, group_by_sent, dependencies,
                 chunk_types=None, encoding='utf8'):
        """
        :param corpus_file: path (or PathPointer) of the file to view.
        :param tagged: if true, yield (word, tag) tuples instead of words.
        :param group_by_sent: if true, yield one item per sentence.
        :param dependencies: if true, yield each sentence as the raw
            multi-line string suitable for DependencyGraph.
        :param chunk_types: accepted for API compatibility; stored but unused.
        :param encoding: file encoding, defaults to utf8.
        """
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read and return the next blank-line-delimited sentence block."""
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()

        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART):].lstrip()

        # Extract word and tag from either supported column format.
        if not self._dependencies:
            lines = [line.split('\t') for line in sent.split('\n')]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                # word / tag (/ optional extra column): word first, tag second.
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                # 10-column CoNLL format: FORM is column 2, POSTAG column 5.
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError('Unexpected number of fields in dependency tree file')

            # Discard tags if they weren't requested.  BUG FIX: this must be
            # nested inside the non-dependency branch -- when
            # self._dependencies is true, ``sent`` is still the raw string
            # that DependencyGraph expects, and unpacking (word, tag) from
            # its characters would raise a ValueError.
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)