Spaces:
Sleeping
Sleeping
File size: 2,305 Bytes
bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 45c58d0 bcf5ff3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# Import required libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize, TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import NLTKWordTokenizer
# Sample sentences fed to word_tokenize. They cover currency amounts, quoted
# speech, contractions, percentages, clock times, lists after a colon, and
# commas that sit next to numbers.
s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
s4 = "I cannot cannot work under these conditions!"
s5 = "The company spent $30,000,000 last year."
s6 = "The company spent 40.75% of its income last year."
s7 = "He arrived at 3:00 pm."
s8 = "I bought these items: books, pencils, and pens."
s9 = "Though there were 150, 100 of them were old."
s10 = "There were 300,000, but that wasn't enough."
s11 = "It's more'n enough."

# Tokenize the samples one at a time, in declaration order, and print each
# resulting token list for visual inspection.
for _sentence in (s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11):
    print(word_tokenize(_sentence))
# Span-tokenization check for NLTKWordTokenizer.
#
# NOTE: there must be TWO spaces after "(York)." in the text below. The
# hard-coded offsets place "." at (37, 38) and "Please" at (40, 46), and the
# last span ends at 78, which is only possible when the string is 78
# characters long. With a single space the hard-coded offsets no longer line
# up with the text and the span comparison cannot succeed.
s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''

# Compute the spans once; both comparisons below reuse them.
_spans = list(NLTKWordTokenizer().span_tokenize(s))

# 1) The (start, end) character offsets of every token.
expected = [
    (0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
    (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
    (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
    (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78),
]
print(_spans == expected)

# 2) Slicing the original text with each span must give back the token text.
expected = [
    'Good', 'muffins', 'cost', '$', '3.88', 'in',
    'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
    'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.',
]
print([s[start:end] for start, end in _spans] == expected)
# Guillemet check: the expected token list has the opening '\xab' and the
# closing '\xbb' as separate tokens, alongside the words and the final period.
expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb']
sx1 = '\xabNow that I can do.\xbb'
_sx1_tokens = word_tokenize(sx1)
# Prints True when the tokenizer output matches the expected list exactly.
print(_sx1_tokens == expected)
sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT
|