I'm using nltk for a personal project. It's a great library that provides many tools for natural language processing. It offers several kinds of tokenizers, but they only split a string into substrings without keeping track of each token's location or other useful metadata. I needed the location of each token (its line and column number) in the original text, so I wrote this simple tokenizer, modeled on the function nltk.wordpunct_tokenize:
import re
def wordpunct_tokenize_position(stream):
    """
    Tokenize a string or a stream and report where each token starts.

    A token is either a run of word characters or a run of characters
    that are neither word characters nor whitespace (i.e. punctuation).
    Whitespace only separates tokens and is never yielded.

    Args:
        stream: Either a text string, or an object exposing a
            ``readlines()`` method that returns the lines to tokenize.

    Yields:
        ``(token, (line_number, column))`` tuples, where both
        ``line_number`` and ``column`` are 0-based and ``column`` is the
        offset of the token's first character within its line.

    >>> list(wordpunct_tokenize_position('nltk is great'))
    [('nltk', (0, 0)), ('is', (0, 5)), ('great', (0, 8))]
    >>> list(wordpunct_tokenize_position('nltk\\nis\\ngreat'))
    [('nltk', (0, 0)), ('is', (1, 0)), ('great', (2, 0))]
    >>> list(wordpunct_tokenize_position('nltk is nltk'))
    [('nltk', (0, 0)), ('is', (0, 5)), ('nltk', (0, 8))]
    """
    # ``basestring`` only exists on Python 2 (where it covers both ``str``
    # and ``unicode``); fall back to ``str`` so the check works on Python 3
    # instead of raising NameError.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(stream, string_types):
        lines = stream.splitlines()  # a list, not an iterator
    else:
        lines = stream.readlines()

    token_pattern = re.compile(r'(\w+|[^\w\s]+)')
    for line_number, line in enumerate(lines):
        for match in token_pattern.finditer(line):
            yield match.group(1), (line_number, match.start())
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module's
    # docstrings as a quick self-check.
    import doctest

    doctest.testmod()