import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Lambda
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras import losses
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ReduceLROnPlateau
#import tensorflow_hub as hub
#import tensorflow_text as text # Bert preprocess uses this
from tensorflow.keras.optimizers import Adam
import re
import nltk
from nltk.corpus import stopwords
import string
from gensim.models import KeyedVectors
#nltk.download('stopwords')
Goal
I’m learning NLP. To do that, I decided to work through diverse NLP models, study the theory, and code them! I think it is a good way to learn Machine Learning. So, Disclaimer: all the content in these notebooks is what I understood from diverse references (I will put the links); some things could be wrong. If you find any mistake, please let me know so I can learn from it. Also, if you have any doubt, it will be a pleasure for me to answer it (as long as I have the answer).
In the end, I achieved a score of 0.843 on the LB. It is beautiful to see how the solution improves step by step!
So, this will be the embeddings notebook; I will put the link to each specific notebook here (so it stays readable).
Methodologies & Notebooks:
| Model Notebook | Score |
|---|---|
| Simple Neural Network (View on Kaggle) | 0.56 |
| Embeddings (View on Kaggle) | 0.797 |
| Recurrent Neural Networks (View on Kaggle) | 0.809 |
| BERT & HuggingFace (View on Kaggle) | 0.824 |
| MyBestSolution (View on Kaggle) | 0.843 |
EDA
Here I will do some preprocessing and split the data. I will use that data in each notebook!
There are already a lot of notebooks with beautiful EDA, so I won’t spend too much time on it here.
Libraries
Data
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
train_df.head()
| | id | keyword | location | text | target |
|---|---|---|---|---|---|
| 0 | 1 | NaN | NaN | Our Deeds are the Reason of this #earthquake M... | 1 |
| 1 | 4 | NaN | NaN | Forest fire near La Ronge Sask. Canada | 1 |
| 2 | 5 | NaN | NaN | All residents asked to 'shelter in place' are ... | 1 |
| 3 | 6 | NaN | NaN | 13,000 people receive #wildfires evacuation or... | 1 |
| 4 | 7 | NaN | NaN | Just got sent this photo from Ruby #Alaska as ... | 1 |
test_df = pd.read_csv("../input/nlp-getting-started/test.csv")
test_df.head()
| | id | keyword | location | text |
|---|---|---|---|---|
| 0 | 0 | NaN | NaN | Just happened a terrible car crash |
| 1 | 2 | NaN | NaN | Heard about #earthquake is different cities, s... |
| 2 | 3 | NaN | NaN | there is a forest fire at spot pond, geese are... |
| 3 | 9 | NaN | NaN | Apocalypse lighting. #Spokane #wildfires |
| 4 | 11 | NaN | NaN | Typhoon Soudelor kills 28 in China and Taiwan |
EDA
# Target Proportion
sns.countplot(data=train_df, x="target")
<AxesSubplot:xlabel='target', ylabel='count'>
I think it is balanced!
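To put a number on that, here is a small sketch I’m adding that checks the class proportions directly:
# Fraction of tweets in each class (quick numeric check of the balance claim above)
train_df.target.value_counts(normalize=True)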
# Random example of disaster tweet
train_df[train_df.target == 1].sample(1).text.values[0]
'California Bush fires please evacuate affected areas ASAP when california govts advised you to do so http://t.co/ubVEVUuAch'
# Random example of NO disaster tweet
train_df[train_df.target == 0].sample(1).text.values[0]
'Someone split a mudslide w me when I get off work'
Pre-processing
I will do some preprocessing with TensorFlow!
# Input Tensor Data
text = tf.data.Dataset.from_tensor_slices(train_df.text)
text
2022-12-11 15:20:58.687517: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
<TensorSliceDataset shapes: (), types: tf.string>
Note that I’m reading the data from memory! If the data were huge, I would be in trouble!
One advantage of initializing a TensorFlow dataset is that I can build a data pipeline on top of it (batch, prefetch, shuffle, etc.), as sketched below.
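For example, a minimal sketch (not used in the rest of this notebook) of what such a pipeline could look like on top of `text`:
# Hypothetical pipeline: shuffle, batch and prefetch before feeding a model
pipeline = text.shuffle(buffer_size=1024).batch(32).prefetch(tf.data.AUTOTUNE)
pipeline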
# some samples
list(text.take(2).as_numpy_iterator())
[b'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
b'Forest fire near La Ronge Sask. Canada']
We need to keep in mind that models don’t understand text by itself, just numbers! For this, we vectorize the sentences. I don’t plan to train a model yet; I would like to observe which words appear more often for each target! (Also, I don’t want to consider stopwords.)
I will use the TensorFlow layer TextVectorization. Under the hood, it applies lowercasing and strips punctuation. I also want to remove stopwords, so I will build a custom standardization that does: 1. lowercase 2. strip punctuation 3. remove stop words!
Click here if you don’t know what stop words are.
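To make the idea concrete, here is a minimal sketch (using a made-up two-sentence corpus) of what a multi_hot TextVectorization produces: one column per vocabulary token, with a 1 wherever the token appears in the sentence.
# Toy example (invented sentences) showing multi_hot vectorization
toy_corpus = tf.constant(["forest fire near the lake", "i love the lake"])
toy_vectorizer = tf.keras.layers.TextVectorization(output_mode='multi_hot')
toy_vectorizer.adapt(toy_corpus)
print(toy_vectorizer.get_vocabulary())     # vocabulary tokens (exact order may vary by TF version)
print(toy_vectorizer(toy_corpus).numpy())  # one row per sentence, 1 if the token is present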
#### COUNT WORDS BY TARGET
def custom_standardization(inputs):
    """
    Apply: lowercase, remove punctuation and stopwords
    """
    PUNCTUATION = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
    lowercase = tf.strings.lower(inputs)                           # lowercase
    strip = tf.strings.regex_replace(lowercase, PUNCTUATION, '')   # strip punctuation
    stopwrd = tf.strings.regex_replace(strip, r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*', '')  # remove stopwords
    return stopwrd
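As a quick sanity check (a small illustrative snippet I’m adding, not part of the pipeline below), applying the custom standardization to the first training tweet shown earlier:
# First training tweet from above: lowercased, punctuation stripped, stopwords (e.g. "are", "the", "of") removed
sample = tf.constant(["Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"])
custom_standardization(sample).numpy()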
# model to apply vectorize_layer with custom standardization
vectorize_layer = tf.keras.layers.TextVectorization(output_mode='multi_hot', standardize=custom_standardization)
vectorize_layer.adapt(text)

# model to vectorize
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)

# make counter
train_count = model.predict(text.batch(batch_size=len(text)))                             # predict to count
token_counts = pd.DataFrame(columns=vectorize_layer.get_vocabulary(), data=train_count)   # df with tokens and counts
train_df.rename(columns={"target": "disaster_target"}, inplace=True)                      # rename target because there is a word "target" in the data
count_df = pd.concat([train_df, token_counts], axis=1)                                    # concat
group_count = count_df.iloc[:, 4:].groupby("disaster_target", as_index=False).sum()       # count tokens for each target
melt_count = pd.melt(group_count, id_vars=["disaster_target"], value_name="count")        # one token per row
melt_count.sort_values(by=["count"], ascending=False).head(30)
2022-12-11 15:20:58.997911: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3552: FutureWarning: This dataframe has a column name that matches the 'value_name' column name of the resulting Dataframe. In the future this will raise an error, please set the 'value_name' parameter of DataFrame.melt to a unique name.
exec(code_obj, self.user_global_ns, self.user_ns)
| | disaster_target | variable | count |
|---|---|---|---|
| 2 | 0 | like | 239.0 |
| 4 | 0 | im | 221.0 |
| 6 | 0 | amp | 174.0 |
| 12 | 0 | new | 163.0 |
| 9 | 1 | fire | 162.0 |
| 10 | 0 | get | 158.0 |
| 22 | 0 | dont | 136.0 |
| 21 | 1 | news | 130.0 |
| 18 | 0 | one | 122.0 |
| 15 | 1 | via | 121.0 |
| 42 | 0 | body | 110.0 |
| 51 | 1 | california | 108.0 |
| 53 | 1 | suicide | 104.0 |
| 17 | 1 | people | 101.0 |
| 37 | 1 | police | 97.0 |
| 35 | 1 | disaster | 96.0 |
| 14 | 0 | via | 96.0 |
| 7 | 1 | amp | 95.0 |
| 38 | 0 | would | 93.0 |
| 95 | 1 | killed | 90.0 |
| 24 | 0 | video | 90.0 |
| 16 | 0 | people | 90.0 |
| 3 | 1 | like | 88.0 |
| 117 | 1 | hiroshima | 84.0 |
| 87 | 1 | fires | 82.0 |
| 62 | 0 | know | 82.0 |
| 28 | 0 | 2 | 81.0 |
| 104 | 0 | full | 81.0 |
| 84 | 0 | love | 81.0 |
| 58 | 0 | time | 80.0 |
After building it, I realized that it is not necessary to instantiate a model just to use layers! 😅
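For instance, a small sketch (reusing the vectorize_layer adapted above, with a made-up input sentence) that calls the layer on a string tensor directly:
# Calling the adapted TextVectorization layer directly, no Sequential model needed
vectorize_layer(tf.constant(["forest fire evacuation order"])).shape  # (1, vocabulary_size) multi-hot row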
Split Data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df[[col for col in train_df.columns if col != 'disaster_target']],
    train_df.disaster_target,
    test_size=0.2,
    random_state=13
)
# To csv (In some notebooks I will use this data)
pd.concat([X_train, y_train], axis=1).to_csv('df_train.csv', index=False)
pd.concat([X_test, y_test], axis=1).to_csv('df_test.csv', index=False)
This dataset is here: https://www.kaggle.com/datasets/diegomachado/df-split
# Delete it from memory
del train_df, test_df, X_train, X_test, y_train, y_test
gc.collect()
671