Learn practical skills, build real-world projects, and advance your career
Updated 3 years ago
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Stop-word filtering: read the corpus, print every word that is not in
# the English stop-word set, append those words to "filtered.txt", and
# report the word counts before and after filtering.
stop_words = set(stopwords.words('english'))

# Read the whole corpus; the context manager closes the handle for us.
with open("Text_Brown_corpus.txt") as file1:
    line = file1.read()

words = line.split()
print("Total words present", len(words))

# Open the output once rather than reopening it for every kept word.
# Append mode is preserved, so words written by earlier runs stay in the
# file and are included in the count printed below.
with open("filtered.txt", 'a') as appendFile:
    for r in words:
        if r not in stop_words:
            print(r)
            appendFile.write(" " + r)

# Count the words actually stored in the filtered file. The previous code
# re-split `line` (the unfiltered corpus text) here, so it reported the
# original word count instead of the filtered one.
with open("filtered.txt", "r") as file:
    words = file.read().split()
print("Total words present in new file", len(words))
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# Stemming: tokenize the corpus and print the Porter stem of every token.
# The file is read inside a context manager so the handle is closed
# instead of being left to the garbage collector.
with open("Text_Brown_corpus.txt") as corpus_file:
    file_content1 = corpus_file.read()

words = word_tokenize(file_content1)
ps = PorterStemmer()
for w in words:
    rootWord = ps.stem(w)
    print(rootWord)
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
# Lemmatization: POS-tag every token of the corpus and print the token
# next to its WordNet lemma.

# Maps the first character of a POS tag to a WordNet part-of-speech
# constant; any prefix not listed here falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Read the corpus inside a context manager so the handle is closed.
with open("Text_Brown_corpus.txt") as corpus_file:
    text = corpus_file.read()

tokens = word_tokenize(text)
lemma_function = WordNetLemmatizer()
for token, tag in pos_tag(tokens):
    # Only the first character of the tag selects the WordNet POS.
    lemma = lemma_function.lemmatize(token, tag_map[tag[0]])
    print(token, "=>", lemma)