Part of Speech Tagger

This notebook contains a model for part-of-speech (POS) tagging of English sentences. There are many POS tags. Given a sentence, the model converts it into the corresponding sequence of POS tags. We will use the universal tagset for this task. The tags used are:

ADJ - Adjective

ADP - Adposition

ADV - Adverb

PRT - Particle

PRON - Pronoun

. - Punctuation marks

X - Other

VERB - Verb

CONJ - Conjunction

DET - Determiner / Article

NOUN - Noun

NUM - Numeral

import nltk
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources needed for training:
# the Brown corpus and the universal tagset package.
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Tag inventory: the two special entries '<EOS>' and '<UNK>' come first,
# followed by the twelve universal POS tags. Order is significant if the
# list position is later used as a tag id, so it is kept unchanged.
all_tags = [
    '<EOS>', '<UNK>',
    'ADV', 'NOUN', 'ADP', 'PRON', 'DET', '.',
    'PRT', 'VERB', 'X', 'NUM', 'CONJ', 'ADJ',
]

# Tagged sentences of the Brown corpus, requested with the universal tagset.
data = nltk.corpus.brown.tagged_sents(tagset='universal')

# Report how many sentences were loaded.
print(len(data))

[nltk_data] Downloading package brown to /content/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /content/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
57340

# Preview the loaded corpus: each sentence is a list of (word, tag) pairs.
data

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('of', 'ADP'), ('Atlanta', 'NOUN'), ("''", '.'), ('for', 'ADP'), ('the', 'DET'), ('manner', 'NOUN'), ('in', 'ADP'), ('which', 'DET'), ('the', 'DET'), ('election', 'NOUN'), ('was', 'VERB'), ('conducted', 'VERB'), ('.', '.')], ...]

def _lowercase_words(sentence):
    """Return the sentence with every word lowercased and its tag unchanged."""
    return [(token.lower(), pos) for token, pos in sentence]


# Normalize the corpus: lowercase each word while keeping its (word, tag)
# pairing, and store the result as a plain list of sentences.
data = list(map(_lowercase_words, data))

# Show the second sentence as a quick sanity check of the conversion.
data[1]

[('the', 'DET'),
 ('jury', 'NOUN'),
 ('further', 'ADV'),
 ('said', 'VERB'),
 ('in', 'ADP'),
 ('term-end', 'NOUN'),
 ('presentments', 'NOUN'),
 ('that', 'ADP'),
 ('the', 'DET'),
 ('city', 'NOUN'),
 ('executive', 'ADJ'),
 ('committee', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('had', 'VERB'),
 ('over-all', 'ADJ'),
 ('charge', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('election', 'NOUN'),
 (',', '.'),
 ('``', '.'),
 ('deserves', 'VERB'),
 ('the', 'DET'),
 ('praise', 'NOUN'),
 ('and', 'CONJ'),
 ('thanks', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('city', 'NOUN'),
 ('of', 'ADP'),
 ('atlanta', 'NOUN'),
 ("''", '.'),
 ('for', 'ADP'),
 ('the', 'DET'),
 ('manner', 'NOUN'),
 ('in', 'ADP'),
 ('which', 'DET'),
 ('the', 'DET'),
 ('election', 'NOUN'),
 ('was', 'VERB'),
 ('conducted', 'VERB'),
 ('.', '.')]