import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import tensorflow as tf
import tensorflow.keras as K
tf.__version__
K.__version__ # the Keras API bundled with the TensorFlow backend
data = K.datasets.imdb
(train_text, train_labels), (val_text, val_labels) = data.load_data(num_words=20000)
train_text.shape, train_labels.shape # Note: the text shape has no sequence-length axis - each review is a plain Python list, so lengths are not yet standardized
val_text.shape, val_labels.shape
print(train_text[0]) # Train Data is in Numbers (Each number maps to a word)
print(train_labels[:10]) # Train Labels are Binary (Positive or Negative Review)
word_index = data.get_word_index()
len(word_index)
word_index
word_index = {k: (v+3) for (k,v) in word_index.items()} # load_data defaults to index_from=3, so every raw index must shift by 3
word_index["<PAD>"] = 0 # Used to pad sentences so all sequence lengths match
word_index["<START>"] = 1 # Marks the start of a sequence
word_index["<UNK>"] = 2 # Stands in for unknown / out-of-vocabulary words
word_index["<UNUSED>"] = 3 # Index 3 never occurs in the data with the default settings
num_to_word = {v: k for (k,v) in word_index.items()}
for i in range(10):
    print(num_to_word[i+1])
def decode(numbers):
    sentence = ""
    for number in numbers:
        sentence += num_to_word[number] + " "
    return sentence
decode(train_text[0])
decode(train_text[1]) # Contains words like 'best' and 'good', yet a reader can easily tell this is a negative review
decode(train_text[2])
train_data = K.preprocessing.sequence.pad_sequences(train_text, value=0, padding='post', maxlen=256)
decode(train_data[0])
decode(train_data[1])
decode(train_data[2])
train_data.shape
val_data = K.preprocessing.sequence.pad_sequences(val_text, value=0, padding='post', maxlen=256)
# Other ways to represent text: Word2Vec, GloVe, ELMo, TF-IDF
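# A minimal sketch of the TF-IDF alternative named above (an assumption:
# scikit-learn is available; it is not used elsewhere in this notebook).
# Each review becomes a sparse vector of frequency-weighted word counts
# instead of a learned embedding.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=20000)
tfidf_matrix = tfidf.fit_transform([decode(t) for t in train_text])
tfidf_matrix.shape # (25000, 20000) sparse matrix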
model = K.Sequential([
K.layers.Embedding(len(word_index), 8), # one learned 8-dim vector per vocabulary entry
K.layers.GlobalAveragePooling1D(), # Averages the embeddings across the sequence (time) axis, giving one 8-dim vector per review
K.layers.Dense(32, activation='relu'),
K.layers.Dense(16, activation='relu'),
K.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_data, train_labels, epochs=5, batch_size=128)
model.evaluate(val_data, val_labels)
model.summary()
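# Most of the parameter count in the summary above sits in the Embedding
# layer: one 8-dim vector per vocabulary entry.
print('embedding parameters:', len(word_index) * 8)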
rec_model = K.Sequential([
K.layers.Embedding(len(word_index), 8),
K.layers.SimpleRNN(4, return_sequences=False), # No activation argument needed - SimpleRNN applies tanh internally by default
K.layers.Dense(32, activation='relu'),
K.layers.Dense(16, activation='relu'),
K.layers.Dense(1, activation='sigmoid')
])
rec_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
rec_model.fit(train_data, train_labels, epochs=5, batch_size=128)
rec_model.evaluate(val_data, val_labels)
rec_model.summary()
gru_model = K.Sequential([
K.layers.Embedding(len(word_index), 8),
K.layers.GRU(4, return_sequences=False), # No activation argument needed - the GRU gates use sigmoid and the candidate state uses tanh by default
K.layers.Dense(32, activation='relu'),
K.layers.Dense(16, activation='relu'),
K.layers.Dense(1, activation='sigmoid')
])
gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
gru_model.fit(train_data, train_labels, epochs=5, batch_size=128)
gru_model.evaluate(val_data, val_labels)
gru_model.summary()
lstm_model = K.Sequential([
K.layers.Embedding(len(word_index), 8),
K.layers.LSTM(4, return_sequences=False), # No activation argument needed - the LSTM gates use sigmoid and the cell/output transforms use tanh by default
K.layers.Dense(32, activation='relu'),
K.layers.Dense(16, activation='relu'),
K.layers.Dense(1, activation='sigmoid')
])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(train_data, train_labels, epochs=5, batch_size=128)
lstm_model.evaluate(val_data, val_labels)
lstm_model.summary()
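# Side-by-side comparison of the four trained models on the validation set
# (a convenience sketch; the names on the left are informal labels).
for name, m in [('avg-pool', model), ('simple-rnn', rec_model),
                ('gru', gru_model), ('lstm', lstm_model)]:
    loss, acc = m.evaluate(val_data, val_labels, verbose=0)
    print('{:>10}: val_loss={:.4f}, val_acc={:.4f}'.format(name, loss, acc))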
gru_model.predict(val_data[0:1]) # a batch of one review; the nested-list form can be misread as a 3-D input
decode(val_text[0])
print('Models Predicted:',
      model.predict(val_data[0:1]),
      rec_model.predict(val_data[0:1]),
      gru_model.predict(val_data[0:1]),
      lstm_model.predict(val_data[0:1]))
print('True Value:', val_labels[0])
decode(val_text[1])
print('Models Predicted:',
      model.predict(val_data[1:2]),
      rec_model.predict(val_data[1:2]),
      gru_model.predict(val_data[1:2]),
      lstm_model.predict(val_data[1:2]))
print('True Value:', val_labels[1])
decode(val_text[2])
print('Models Predicted:',
      model.predict(val_data[2:3]),
      rec_model.predict(val_data[2:3]),
      gru_model.predict(val_data[2:3]),
      lstm_model.predict(val_data[2:3]))
print('True Value:', val_labels[2])
def encode(sentence):
    # Mirror the training format: lowercase, map words to indices,
    # prepend <START>, map unknown words to <UNK>, and pad/trim to 256.
    words = sentence.lower().split()
    numbers = [word_index["<START>"]]
    for word in words:
        numbers.append(word_index.get(word, word_index["<UNK>"]))
    numbers = numbers[:256]
    if len(numbers) < 256:
        numbers += [word_index["<PAD>"]] * (256 - len(numbers))
    return numbers
my_sentence = encode('Things were good at the start but it only got worse, even though i still enjoyed the movie')
my_batch = np.array([my_sentence]) # shape (1, 256): a batch of one
print(model.predict(my_batch),
      rec_model.predict(my_batch),
      gru_model.predict(my_batch),
      lstm_model.predict(my_batch))
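# Sanity check: decoding the hand-encoded sentence should reproduce the input,
# preceded by <START> and followed by trailing <PAD> tokens.
print(decode(my_sentence))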
shopee_data = pd.read_csv('./sources/shopee_beauty_data.csv', index_col=0)
shopee_data.head()
data = shopee_data[['title', 'Product_texture']].dropna()
data.head()
X = data['title']
Y = data['Product_texture']
import json
with open('./sources/beauty_profile_train.json') as f:
beauty_profiles = json.load(f)
class_names = [pair[0] for pair in sorted(beauty_profiles['Product_texture'].items(), key=lambda x: x[1])]
num_classes = len(class_names)
print(class_names)
tokenizer = K.preprocessing.text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X)
word_index = {k: (v+2) for (k,v) in tokenizer.word_index.items()} # Tokenizer indices start at 1, so +2 leaves room for the special tokens
word_index["<PAD>"] = 0 # Used to pad sentences so all sequence lengths match
word_index["<START>"] = 1 # Marks the start of a sequence
word_index["<UNK>"] = 2 # Stands in for unknown / out-of-vocabulary words
int_data = data['title'].apply(lambda x: [1] + [word_index.get(xi, 2) for xi in x.lower().split()]) # lowercase so lookups match the Tokenizer's lowercased vocabulary
padded_data = K.preprocessing.sequence.pad_sequences(int_data, value=0, padding='post', maxlen=30)
print(padded_data)
print(padded_data[0])
num_to_word = {v: k for (k,v) in word_index.items()}
print(decode(padded_data[0]))
print(decode(padded_data[1]))
padded_data.shape
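# The manual mapping above mirrors what the fitted Tokenizer can produce
# directly. A rough equivalent for comparison (not identical: with
# num_words=1000, texts_to_sequences silently drops out-of-vocabulary words
# instead of mapping them to <UNK>, and its raw indices need the same +2 shift):
seqs = [[1] + [i + 2 for i in s] for s in tokenizer.texts_to_sequences(data['title'])]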
split_ratio = 0.2
split_idx = int(split_ratio*len(padded_data))
X_train = padded_data[split_idx:]
Y_train = Y.iloc[split_idx:] # .iloc makes the positional slice explicit
X_val = padded_data[:split_idx]
Y_val = Y.iloc[:split_idx]
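# The slice above takes the first 20% of rows as validation without shuffling,
# so any ordering in the CSV leaks into the split. A shuffled alternative
# sketch (assumptions: scikit-learn is available; new names are used so the
# split above stays in effect):
from sklearn.model_selection import train_test_split
Xtr_s, Xval_s, Ytr_s, Yval_s = train_test_split(padded_data, Y.values, test_size=0.2, random_state=42)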
gru_model = K.Sequential([
K.layers.Embedding(len(word_index), 8),
K.layers.GRU(4, return_sequences=False),
K.layers.Dense(32, activation='relu'),
K.layers.Dense(16, activation='relu'),
K.layers.Dense(num_classes, activation='softmax')
])
gru_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
gru_model.fit(X_train, Y_train, epochs=3, batch_size=64)
gru_model.evaluate(X_val, Y_val)
class_names
preds = gru_model.predict(X_val)
class_preds = np.argmax(preds, axis=1)
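# Optional: per-class prediction counts, to spot whether the model collapses
# onto the majority texture (a small sketch using the pandas import above).
print(pd.Series(class_preds).map(lambda i: class_names[i]).value_counts())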
val_text = data['title'].iloc[:split_idx]
for i in range(20):
    print(val_text.iloc[i])
    print('True Value: {} | Predicted: {}'.format(class_names[int(Y_val.iloc[i])], class_names[class_preds[i]]))
    print()
def predictor(text):
    int_data = [1] + [word_index.get(xi, 2) for xi in text.lower().split()]
    padded_data = K.preprocessing.sequence.pad_sequences([int_data], value=0, padding='post', maxlen=30)
    pred = gru_model.predict(padded_data)
    idx = np.argmax(pred)
    class_pred = class_names[idx]
    return class_pred
print(class_names)
text = "suss special invincible amazing super delicious unbelievable wet jelly of immortality"
predictor(text)
text = "dijamin terlihat lebih muda dan lebih indah bubuk super luar biasa dengan aditif kecantikan"
predictor(text)
text = "dijamin terlihat lebih muda dan mousse super luar biasa lebih indah dengan aditif kecantikan"
predictor(text)