Inspired by https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/
cat txt/collated_books.txt | \
tr '\n' ' ' | \
sed "s,', ,g" | \
sed "s,_,,g" | \
sed 's,[ ]\+, ,g' | \
sed "s,^ ,,g" > txt/collated_books_stripped.txt

conda create -n keras python=3.6 tensorflow keras nltk
conda activate keras

import numpy as np
import tensorflow as tf
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
import keras
from pickle import dump
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Embedding
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import WordPunctTokenizer

file = open('txt/collated_books_stripped.txt', 'r')
doc = file.read().lower()
file.close()
# keep words, and treat '.' and ',' as their own tokens
tokenizer = RegexpTokenizer(r'\w+|\.|,')
tokens = tokenizer.tokenize(doc)
print(tokens[:20])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
data = '\n'.join(sequences)
file = open('txt/collated_books_stripped_sequences.txt', 'w')
file.write(data)
file.close()

# Import the text
file = open('txt/collated_books_stripped_sequences.txt', 'r')
doc = file.read()
file.close()
lines = doc.split('\n')
# Tokenize; the filter string is the Keras default minus '.' and ',', so those stay in the vocabulary
tokenizer = Tokenizer(filters = '!"#$%&()*+-/:;<=>?@[\]^_`{|}~\t\n')
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
vocab_size = len(tokenizer.word_index) + 1
# One-hot encode the output word
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = keras.utils.to_categorical(y, num_classes = vocab_size)
seq_length = X.shape[1]
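# Aside (not in the original tutorial): to_categorical materialises a
# (num_sequences, vocab_size) float matrix, which can exhaust memory on a large corpus.
# A lighter-weight alternative is to keep y as integer indices and use a sparse loss, e.g.:
#   y = sequences[:,-1]
#   model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])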
# define model
model = Sequential()
model.add(Embedding(vocab_size, seq_length, input_length = seq_length))
model.add(LSTM(30))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
history = model.fit(X, y, batch_size = 256, epochs = 10)
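# Optional (an assumption, not part of the original recipe): checkpoint during training
# so a long run keeps the best weights seen so far, e.g.:
#   from keras.callbacks import ModelCheckpoint
#   checkpoint = ModelCheckpoint('model.h5', monitor='loss', save_best_only=True)
#   history = model.fit(X, y, batch_size=256, epochs=10, callbacks=[checkpoint])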
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict the index of the most likely next word
        yhat = model.predict_classes(encoded, verbose=0)[0]
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
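# Note (an assumption about newer library versions): Sequential.predict_classes was removed
# in later Keras/TensorFlow releases; an equivalent argmax over the softmax output would be:
#   import numpy as np
#   yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)[0]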
# load the model
model = load_model('model.h5')
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))
# generate new text
seed_text = 'ever since his return , frodo did not know this when gandalf arrived . But the mountains were close and pippin could not abandon merry there '
generated = generate_seq(model, tokenizer, 50, seed_text, 100)  # 50 = training sequence length (X.shape[1])
print(seed_text + ' /// ' + generated)