Learn practical skills, build real-world projects, and advance your career
Created 3 years ago
Part of Speech Tagger
This notebook contains a model for tagging parts of speech (POS) in an English sentence. There are many POS tags. The model converts a sentence into its sequence of POS tags. We will use the universal tagset for this task. The tags used are:
ADJ - Adjective
ADP - Adposition
ADV - Adverb
PRT - Particle
PRON - Pronoun
. - Punctuation marks
X - Other
VERB - Verb
CONJ - Conjunction
DET - Determiner / Article
NOUN - Noun
NUM - Numeral
import nltk
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
# Fetch the NLTK resources used below: the Brown corpus and the
# universal tagset mapping.
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Tag inventory: the '<EOS>' and '<UNK>' entries followed by the twelve
# universal POS tags.
all_tags = [
    '<EOS>', '<UNK>',
    'ADV', 'NOUN', 'ADP', 'PRON', 'DET', '.',
    'PRT', 'VERB', 'X', 'NUM', 'CONJ', 'ADJ',
]

# Brown corpus sentences, each a sequence of (word, tag) pairs expressed
# in the universal tagset; print how many sentences were loaded.
data = nltk.corpus.brown.tagged_sents(tagset='universal')
print(len(data))
[nltk_data] Downloading package brown to /content/nltk_data...
[nltk_data] Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data] /content/nltk_data...
[nltk_data] Unzipping taggers/universal_tagset.zip.
57340
# Bare expression: in a notebook this displays the tagged sentences.
data
[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('of', 'ADP'), ('Atlanta', 'NOUN'), ("''", '.'), ('for', 'ADP'), ('the', 'DET'), ('manner', 'NOUN'), ('in', 'ADP'), ('which', 'DET'), ('the', 'DET'), ('election', 'NOUN'), ('was', 'VERB'), ('conducted', 'VERB'), ('.', '.')], ...]
# Lowercase every word, keeping its tag and the sentence grouping intact.
# The result is a plain list of sentences, each a list of (word, tag) tuples.
data = [
    [(token.lower(), pos) for token, pos in tagged_sentence]
    for tagged_sentence in data
]
# Show the second sentence to check the result.
data[1]
[('the', 'DET'),
('jury', 'NOUN'),
('further', 'ADV'),
('said', 'VERB'),
('in', 'ADP'),
('term-end', 'NOUN'),
('presentments', 'NOUN'),
('that', 'ADP'),
('the', 'DET'),
('city', 'NOUN'),
('executive', 'ADJ'),
('committee', 'NOUN'),
(',', '.'),
('which', 'DET'),
('had', 'VERB'),
('over-all', 'ADJ'),
('charge', 'NOUN'),
('of', 'ADP'),
('the', 'DET'),
('election', 'NOUN'),
(',', '.'),
('``', '.'),
('deserves', 'VERB'),
('the', 'DET'),
('praise', 'NOUN'),
('and', 'CONJ'),
('thanks', 'NOUN'),
('of', 'ADP'),
('the', 'DET'),
('city', 'NOUN'),
('of', 'ADP'),
('atlanta', 'NOUN'),
("''", '.'),
('for', 'ADP'),
('the', 'DET'),
('manner', 'NOUN'),
('in', 'ADP'),
('which', 'DET'),
('the', 'DET'),
('election', 'NOUN'),
('was', 'VERB'),
('conducted', 'VERB'),
('.', '.')]