In [7]:
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import random
import json
import string
import unicodedata
import sys
In [52]:
# Translation table for str.translate(): every Unicode code point whose general
# category starts with 'P' (punctuation) is mapped to None, which str.translate()
# treats as "delete this character".
# The range ends at sys.maxunicode + 1 because range() excludes its upper bound
# and chr(sys.maxunicode) is itself a valid code point.
tbl = {
    code_point: None
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
}
# method to remove punctuations from sentences.
def remove_punctuation(text, table=None):
    """Return ``text`` with the characters listed in ``table`` removed.

    Args:
        text: The sentence to clean.
        table: Optional ``str.translate`` mapping (for example the module-level
            ``tbl``). When omitted, ``text`` is returned unchanged, which is
            exactly what this function did before the parameter existed.

    Returns:
        The translated string, or ``text`` itself when no table is given.
    """
    # Stripping is opt-in so existing one-argument callers keep their behavior.
    # NOTE(review): the translate call was commented out in the original,
    # presumably because inputs such as e-mail addresses depend on their
    # punctuation -- confirm before enabling it for the training data.
    if table is None:
        return text
    return text.translate(table)
# initialize the stemmer
stemmer = LancasterStemmer()

# variable to hold the JSON data read from the file; rebinding it to None first
# means a failed load cannot leave a value from an earlier run in place
data = None

# Read the JSON file and load the training data.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so the file
# is decoded the same way on every platform instead of using the locale default.
with open('traindata.json', encoding='utf-8') as json_data:
    data = json.load(json_data)

print(data)
# the categories to train for are the top-level keys of the training data
categories = list(data.keys())
# every token seen in any sentence (duplicates are removed further down)
words = []
# one (tokens, category name) tuple per training sentence
docs = []

for category_name, sentences in data.items():
    for raw_sentence in sentences:
        # run the sentence through the punctuation hook before tokenizing
        cleaned = remove_punctuation(raw_sentence)
        print(cleaned)

        # split the sentence into tokens, record them in the running word
        # list and remember which category they came from
        tokens = nltk.word_tokenize(cleaned)
        print("tokenized words: ", tokens)
        words += tokens
        docs.append((tokens, category_name))

# lower-case and stem every token, drop duplicates, and keep the result sorted
words = sorted({stemmer.stem(token.lower()) for token in words})

print(words)
print(docs)
In [57]:
# create our training data: one [bag_of_words, one_hot_label] pair per document
training = []
output = []  # not used by the code visible here; kept so the name stays defined

# template for the one-hot label row: one slot per category, all zeros
output_empty = [0] * len(categories)

for doc in docs:
    # doc is a (tokenized words, category name) tuple
    token_words, category = doc

    # stem each word; a set gives constant-time membership tests below
    token_words = {stemmer.stem(word.lower()) for word in token_words}

    # bag of words (bow): 1 where the vocabulary word occurs in this document
    bow = [1 if vocab_word in token_words else 0 for vocab_word in words]

    # one-hot label: copy the zero template and flag this document's category
    output_row = list(output_empty)
    output_row[categories.index(category)] = 1

    # our training set contains the bag of words and the output row that tells
    # which category that bow belongs to
    training.append([bow, output_row])

# shuffle the (features, label) pairs together so they stay aligned
random.shuffle(training)

# train_x contains the bags of words and train_y the matching one-hot labels.
# The pairs are split with plain list operations instead of np.array(training):
# bow and output_row generally have different lengths, and NumPy >= 1.24 raises
# ValueError when asked to build an array from such ragged nested lists.
train_x = [pair[0] for pair in training]
train_y = [pair[1] for pair in training]
In [54]:
## reset underlying graph data
tf.reset_default_graph()

# Build neural network: input layer sized to one bag-of-words vector, two
# fully connected layers of 24 units each, and a softmax layer with one unit
# per entry of a label row.
n_features = len(train_x[0])
n_categories = len(train_y[0])

net = tflearn.input_data(shape=[None, n_features])
for _ in range(2):
    net = tflearn.fully_connected(net, 24)
net = tflearn.fully_connected(net, n_categories, activation='softmax')
net = tflearn.regression(net)

# Define model and setup tensorboard
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')

# Start training: 25000 epochs in batches of 4, with show_metric enabled,
# then write the trained model to disk
model.fit(train_x, train_y, n_epoch=25000, batch_size=4, show_metric=True)
model.save('model.tflearn')
In [55]:
# Let's test the model on a few sample sentences.
# NOTE(review): the earlier note here said the first two sentences are used for
# training and the last two are not present in the training data, but eleven
# samples are defined below -- confirm which of them appear in traindata.json.
sent_1 = "sandeepkanao@gmail.com"
sent_2 = "123 Main Parkway Vancouver BC"
sent_3 = "B.Sc. Computer Science"
sent_4 = "University of New York"
sent_5 = "jonsmith@mail.com"
sent_6 = "780678709"
sent_7 = "ASP.NET"
sent_8 = "Jon Smith"
sent_9 = "17 MNC Cres Toronto ON"
sent_10 = "2013 - Present Royal Bank Of Canada"
sent_11 = "2012 - 2013 Vancouver, BC Software Developer BC Development"
# a method that takes in a sentence and, using the global list of all words,
# returns the data in a form that can be fed to tensorflow
def get_tf_record(sentence):
    """Encode ``sentence`` as a bag-of-words vector over the global ``words``.

    The sentence is tokenized with ``nltk.word_tokenize`` and every token is
    lower-cased and stemmed with the global ``stemmer``. Entry ``i`` of the
    result is 1 when ``words[i]`` equals one of those stems and 0 otherwise.

    Args:
        sentence: The text to encode.

    Returns:
        A NumPy array with one entry per item in ``words``.
    """
    # tokenize the pattern and stem each token
    stems = {stemmer.stem(token.lower()) for token in nltk.word_tokenize(sentence)}
    # bag of words: flag every vocabulary entry that occurs in the sentence
    flags = [1 if vocab_word in stems else 0 for vocab_word in words]
    return np.array(flags)
# we can start to predict the results for each of the sentences:
# encode the sentence, run the model on it, and print the category whose
# position has the highest predicted value
for sample in (sent_1, sent_2, sent_3, sent_4, sent_5, sent_6,
               sent_7, sent_8, sent_9, sent_10, sent_11):
    record = get_tf_record(sample)
    scores = model.predict([record])
    print(categories[np.argmax(scores)])
Software development Vancouver BC Canada
Reply Delete
Welcome to Joho Tek, a digital transformation company in Canada. Get the best software development & cloud solutions in Vancouver. We provide technical support, network administration and systems monitoring, among other value-added services, to keep your business running smoothly.
to get more - https://www.johotek.com/