Inspired by https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/
cat txt/collated_books.txt | \
tr '\n' ' ' | \
sed "s,', ,g" | \
sed "s,_,,g" | \
sed 's,[ ]\+, ,g' | \
sed "s,^ ,,g" > txt/collated_books_stripped.txt
conda create -n keras python=3.6 tensorflow keras nltk
conda activate keras
import numpy as np
import tensorflow as tf
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
import keras
from pickle import dump
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Embedding
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import WordPunctTokenizer
file = open('txt/collated_books_stripped.txt', 'r')
doc = file.read().lower()
file.close()
tokenizer = RegexpTokenizer(r'\w+|\.|,')
tokens = tokenizer.tokenize(doc)
print(tokens[:20])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))
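A quick frequency peek can catch leftover stripping problems before the sequences are built (optional check, not in the original; Counter is from the standard library):
from collections import Counter
print(Counter(tokens).most_common(10))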
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
data = '\n'.join(sequences)
file = open('txt/collated_books_stripped_sequences.txt', 'w')
file.write(data)
file.close()
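Optional sanity check (not in the original): every sequence written out should hold exactly length (51) tokens.
assert all(len(s.split()) == length for s in sequences)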
# Import the text
file = open('txt/collated_books_stripped_sequences.txt', 'r')
doc = file.read()
file.close()
lines = doc.split('\n')
# Tokenize
tokenizer = Tokenizer(filters = '!"#$%&()*+-/:;<=>?@[\]^_`{|}~\t\n')
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
vocab_size = len(tokenizer.word_index) + 1
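Optionally (not in the original), the fitted Tokenizer can be inspected to confirm the vocabulary looks sane:
print('Vocabulary size: %d' % vocab_size)
print(sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)[:10])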
# One-hot encode the output word
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = keras.utils.to_categorical(y, num_classes = vocab_size)
seq_length = X.shape[1]
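Optional shape check (not in the original): X should come out as (n_sequences, 50) and y as (n_sequences, vocab_size).
print(X.shape, y.shape)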
# define model
# embedding -> two stacked LSTMs -> dropout -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, seq_length, input_length = seq_length))
model.add(LSTM(30, return_sequences = True))
model.add(LSTM(100))
model.add(Dropout(0.15))
model.add(Dense(vocab_size, activation = 'softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
history = model.fit(X, y, batch_size = 256, epochs = 10)
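Optionally, a ModelCheckpoint callback keeps the best weights on disk in case the run is interrupted. This is a sketch, not part of the original; the checkpoint file name is illustrative, and it would replace the fit call above:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint('model_checkpoint.h5', monitor='loss', save_best_only=True)
history = model.fit(X, y, batch_size = 256, epochs = 10, callbacks = [checkpoint])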
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict the index of the next word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
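predict_classes always takes the argmax, which tends to make the generated text loop on the same phrases. A sampling variant is sketched below as an alternative (my addition, not in the original); it would replace the yhat line inside the loop:
probs = model.predict(encoded, verbose=0)[0]
probs = probs / probs.sum()  # renormalize the float32 softmax output for np.random.choice
yhat = np.random.choice(len(probs), p=probs)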
# load the model
model = load_model('model.h5')
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))
# generate new text
seed_text = 'ever since his return , frodo did not know this when gandalf arrived . But the mountains were close and pippin could not abandon merry there '
# the third argument must match the 50-token training window (seq_length above)
generated = generate_seq(model, tokenizer, 50, seed_text, 100)
print(seed_text + '///' + generated)