Machine Learning and Natural Language Processing

PUBLISHED ON JUN 13, 2020 — NLP, R, TEXT MINING

Inspired by https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

Clean up the data and select the Results chapters

# Flatten newlines to spaces, replace apostrophes with spaces, drop
# underscores, and collapse runs of spaces into one
cat txt/collated_books.txt | \
    tr '\n' ' ' | \
    sed "s,', ,g" | \
    sed "s,_,,g" | \
    sed 's,[ ]\+, ,g' | \
    sed "s,^ ,,g" > txt/collated_books_stripped.txt

Create a new conda env.

conda create -n keras python=3.6 tensorflow keras nltk
conda activate keras

Start Python

import numpy as np
import tensorflow as tf
# Restrict TensorFlow to a single CPU thread
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from nltk.tokenize import RegexpTokenizer

Import and clean up the data

# Read the stripped text and lowercase it
with open('txt/collated_books_stripped.txt') as file:
    doc = file.read().lower()

# Keep words, periods, and commas as separate tokens
tokenizer = RegexpTokenizer(r'\w+|\.|,')
tokens = tokenizer.tokenize(doc)
print(tokens[:20])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# Organize into overlapping sequences of 50 input tokens plus 1 target token
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)

# Save one sequence per line
data = '\n'.join(sequences)
with open('txt/collated_books_stripped_sequences.txt', 'w') as file:
    file.write(data)
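A quick optional check that every saved line really holds 51 space-separated tokens:

# Every line should hold exactly 50 input tokens plus 1 target token
with open('txt/collated_books_stripped_sequences.txt') as f:
    line_lengths = {len(line.split(' ')) for line in f.read().split('\n')}

print(line_lengths)  # expect {51}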

Start working with the data

# Import the prepared sequences, one per line
with open('txt/collated_books_stripped_sequences.txt') as file:
    doc = file.read()

lines = doc.split('\n')

# Tokenize; '.' and ',' are deliberately left out of the filter list so
# they survive as tokens of their own
tokenizer = Tokenizer(filters='!"#$%&()*+-/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
vocab_size = len(tokenizer.word_index) + 1

# One-hot encode: inputs are the first 50 tokens, the target is the 51st
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]
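A quick shape check confirms the split (the sequence count depends on your corpus):

# X holds integer word indices, y the one-hot targets
print(X.shape)  # (n_sequences, 50)
print(y.shape)  # (n_sequences, vocab_size)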

# define model: embedding -> LSTM -> dense hidden layer -> softmax output
model = Sequential()
model.add(Embedding(vocab_size, seq_length, input_length = seq_length))
model.add(LSTM(30))
# A second Embedding layer here would be a bug: the LSTM emits float
# activations, not word indices, so a Dense hidden layer is used instead
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit model
history = model.fit(X, y, batch_size = 256, epochs = 10)
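fit() returns a History object with the per-epoch metrics; the key names differ across Keras versions ('acc' vs 'accuracy'), so it is safest to list them before indexing:

# Inspect the per-epoch metrics recorded during training
print(history.history.keys())
print(history.history['loss'])  # one loss value per epoch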

# save the model to file
model.save('model.h5')

# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Predict text

from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict the index of the most likely next word
        yhat = model.predict_classes(encoded, verbose=0)[0]
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
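Note that predict_classes was removed from Sequential models in later TensorFlow releases; if the call above fails, an equivalent sketch using plain predict plus argmax (same encoded input as inside generate_seq) is:

import numpy as np

# Drop-in replacement for predict_classes where it no longer exists
probs = model.predict(encoded, verbose=0)  # shape (1, vocab_size)
yhat = int(np.argmax(probs, axis=-1)[0])   # index of the most likely word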

# load the model
model = load_model('model.h5')

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

# generate new text
seed_text = 'ever since his return , frodo did not know this when gandalf arrived . But the mountains were close and pippin could not abandon merry there '
generated = generate_seq(model, tokenizer, 100, seed_text, 100)
print(seed_text + '///' + generated)