# Sentiment classification: sklearn models

In [125]:
%load_ext aicrowd.magic

The aicrowd.magic extension is already loaded. To reload it, use:
  %reload_ext aicrowd.magic


In [126]:
%aicrowd login

Please login here: [34m[1m[4mhttps://api.aicrowd.com/auth/4K4R3TM3Wilf3Wb0YYutWzKRpuB4g9ikjR0pSKwpG34[0m
[32mAPI Key valid[0m
[32mGitlab access token valid[0m
[32mSaved details successfully![0m


In [127]:
# %aicrowd ds dl -c sentiment-classification -o data

## Imports

In [128]:
import os

from ast import literal_eval
from collections import Counter
import pandas as pd
import neptune.new as neptune
import numpy as np
from scipy.stats import uniform
import optuna
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [129]:
NEPTUNE_PROJECT = "deepsense-ai/AIcrowd"
NEPTUNE_API = "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI0NWE5MTZhNi0yMDE3LTQ3N2EtOGMwOS1lZGY1YjRiOWJlYmUifQ=="

## Data

In [130]:
train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")
test_df = pd.read_csv("data/test.csv")

In [131]:
train_df.head()

Unnamed: 0,embeddings,label
0,"[0.3206779360771179, 0.988215982913971, 1.0441...",positive
1,"[0.05074610561132431, 1.0742985010147095, 0.60...",negative
2,"[0.41962647438049316, 0.4505457878112793, 1.39...",negative
3,"[0.4361684024333954, 0.19191382825374603, 0.83...",positive
4,"[0.6382085084915161, 0.8352395296096802, 0.393...",neutral


In [132]:
train_df.label.value_counts()

neutral     1694
positive    1684
negative    1622
Name: label, dtype: int64

In [133]:
X_train = [literal_eval(embedding)  for embedding in train_df['embeddings'].values]
y_train = train_df['label'].values

X_valid = [literal_eval(embedding)  for embedding in val_df['embeddings'].values]
y_valid = val_df['label'].values

X_test = [literal_eval(embedding)  for embedding in test_df['embeddings'].values]

## Standard Scaler

In [134]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

## Submission

In [135]:
def make_submission(y_test_pred):
    submission = pd.DataFrame(
        {
            "embeddings": X_test,
            "label": y_test_pred,
        }
    )
    submission.to_csv(os.path.join("assets", "submission.csv"))
    %aicrowd notebook submit -c sentiment-classification -a assets --no-verify


## Sklearn models

In [136]:
def objective(trial):
    global best_score
    
    # optional PCA
    X_train_scaled_reduced = X_train_scaled
    X_valid_scaled_reduced = X_valid_scaled
    X_test_scaled_reduced = X_test_scaled
    num_dim = 512
    reduce_dim = trial.suggest_categorical("reduce_dim", [False, True])
    if reduce_dim:
        num_dim = trial.suggest_int("num_dim", 32, 512)
        pca = PCA(n_components=num_dim)
        X_train_scaled_reduced = pca.fit_transform(X_train_scaled)
        X_valid_scaled_reduced = pca.transform(X_valid_scaled)
        X_test_scaled_reduced = pca.transform(X_test_scaled)
        
    
    classifier_name = trial.suggest_categorical('classifier', ['mlp', 'svc'])  # 'knn'
    if classifier_name == 'svc':
        svc_c = trial.suggest_int('svc_c', 1, 1e8)
        svc_degree = trial.suggest_int('svc_degree', 2, 11)
        svc_gamma = trial.suggest_float('svc_gamma', 1e-10, 1e1)
        params = {
            "svc_c": svc_c,
            "svc_degree": svc_degree,
            "svc_gamma": svc_gamma,
        }
        classifier = SVC(
            C=svc_c,
            degree=svc_degree,
            gamma=svc_gamma,  # 'auto', 
            random_state=42,
        )
    elif classifier_name == 'knn':
        knn_neighbors = trial.suggest_int('knn_neighbors', 1, 21)
        params = {
            "knn_neighbors": knn_neighbors,
        }
        classifier = KNeighborsClassifier(
            n_neighbors=knn_neighbors,
        )
    elif classifier_name == 'mlp':
        mlp_alpha = trial.suggest_float('mlp_alpha', 1e-10, 10)
        mlp_hidden_layer_sizes = trial.suggest_int('mlp_hidden_layer_sizes', 128, 1024)
        mlp_validation_fraction = trial.suggest_float('mlp_validation_fraction', 0.01, 0.2)
        params = {
            "mlp_alpha": mlp_alpha,
            "mlp_hidden_layer_sizes": mlp_hidden_layer_sizes,
            "mlp_validation_fraction": mlp_validation_fraction,
        }
        classifier = MLPClassifier(
            alpha = mlp_alpha,
            hidden_layer_sizes = mlp_hidden_layer_sizes,
            early_stopping = True,
            n_iter_no_change = 100,
            max_iter = 1000,
            validation_fraction = mlp_validation_fraction,
            random_state=42,
        )
    else:
        raise Exception("Wrong classifier name") 
    
    classifier = classifier.fit(X_train_scaled_reduced, y_train)
    valid_accuracy = classifier.score(X_valid_scaled_reduced, y_valid)
    if valid_accuracy > best_score:
        print("SUBMISION, valid/acc:", valid_accuracy)
        best_score = valid_accuracy
        run = neptune.init(
            project=NEPTUNE_PROJECT,
            api_token=NEPTUNE_API,
            tags=["sentiment_classification", "sklearn", "optuna"]
        )
        run["model"] = classifier_name
        run["parameters"] = params
        run["reduce_dim"] = reduce_dim
        run["num_dim"] = num_dim
        run["train/acc"] = classifier.score(X_train_scaled_reduced, y_train)
        run["valid/acc"] = valid_accuracy
        run.stop()
        
        y_test_pred = classifier.predict(X_test_scaled)
        make_submission(y_test_pred)
        
    return valid_accuracy


In [137]:
best_score = 0.795

In [63]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[32m[I 2022-02-19 11:17:50,888][0m A new study created in memory with name: no-name-f36c719d-a90e-48c5-b46e-480ac04b6a2e[0m
[32m[I 2022-02-19 11:19:33,077][0m Trial 0 finished with value: 0.776 and parameters: {'reduce_dim': True, 'num_dim': 421, 'classifier': 'mlp', 'mlp_alpha': 2.5019245021107923, 'mlp_hidden_layer_sizes': 900, 'mlp_validation_fraction': 0.16645067204494246}. Best is trial 0 with value: 0.776.[0m
[32m[I 2022-02-19 11:20:43,648][0m Trial 1 finished with value: 0.752 and parameters: {'reduce_dim': True, 'num_dim': 83, 'classifier': 'mlp', 'mlp_alpha': 6.61428838340386, 'mlp_hidden_layer_sizes': 911, 'mlp_validation_fraction': 0.12969498331407583}. Best is trial 0 with value: 0.776.[0m
[32m[I 2022-02-19 11:23:11,791][0m Trial 2 finished with value: 0.764 and parameters: {'reduce_dim': True, 'num_dim': 420, 'classifier': 'mlp', 'mlp_alpha': 3.22735189916378, 'mlp_hidden_layer_sizes': 762, 'mlp_validation_fraction': 0.13358083106439148}. Best is trial 0 with va

In [14]:
study.best_params

{'classifier': 'mlp',
 'mlp_alpha': 0.2017750141364524,
 'mlp_hidden_layer_sizes': 455,
 'mlp_validation_fraction': 0.0291592386774798}

## Sklearn models with crossvalidation

In [138]:
train_valid_df = pd.concat([train_df, val_df]) # concat the train and validation set, we will be using the k fold method later
X_train_valid = [literal_eval(embedding) for embedding in train_valid_df['embeddings'].values]
y_train_valid = train_valid_df['label'].values

In [139]:
X_train_valid_scaled = scaler.transform(X_train_valid)

In [140]:
Fold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

In [141]:
def objective_cv(trial):
    global best_score
    params = {
        "alpha": trial.suggest_float('alpha', 1e-10, 10),
        "hidden_layer_sizes": trial.suggest_int('hidden_layer_sizes', 128, 1024),
        "validation_fraction": trial.suggest_float('mlp_validation_fraction', 0.01, 0.2),
    }
    
    # optional PCA
    X_train_valid_scaled_reduced = X_train_valid_scaled
    X_test_scaled_reduced = X_test_scaled
    num_dim = 512
    reduce_dim = trial.suggest_categorical("reduce_dim", [False, True])
    if reduce_dim:
        num_dim = trial.suggest_int("num_dim", 32, 512)
        pca = PCA(n_components=num_dim)
        X_train_valid_scaled_reduced = pca.fit_transform(X_train_valid_scaled)
        X_test_scaled_reduced = pca.transform(X_test_scaled)
        
    # kfold
    f1_scores = []
    models = []
    for n, (trn_, val_) in tqdm(enumerate(Fold.split(X_train_valid_scaled_reduced, y_train_valid))):
        
        fold_train_data = X_train_valid_scaled_reduced[trn_]
        fold_valid_data = X_train_valid_scaled_reduced[val_]
        
        fold_train_labels = y_train_valid[trn_]
        fold_valid_labels = y_train_valid[val_]
        model = MLPClassifier(**params)
        model.fit(fold_train_data, fold_train_labels)
        models.append(model)
        
        valid_pred = model.predict(fold_valid_data)
        f1 = f1_score(fold_valid_labels, valid_pred, average ='weighted')
        f1_scores.append(f1)

    mean_valid_f1 = np.mean(f1_scores)
    
    # neptune and submission
    if mean_valid_f1 > best_score:
        print("SUBMISION, mean_valid_f1:", mean_valid_f1)
#         best_score = mean_valid_f1
        run = neptune.init(
            project=NEPTUNE_PROJECT,
            api_token=NEPTUNE_API,
            tags=["sentiment_classification", "mlp", "optuna", "crossval"]
        )
        run["model"] = "mlp"
        run["parameters"] = params
        run["reduce_dim"] = reduce_dim
        run["num_dim"] = num_dim
        run["mean_valid_f1"] = mean_valid_f1
        run.stop()
        
        predictions = []
        for model in models:
            predictions.append(model.predict(X_test_scaled_reduced))
        
        y_test_pred = [
            Counter([pred[i] for pred in predictions]).most_common(1)[0][0]
            for i in range(len(X_test_scaled_reduced))
        ]
            
        make_submission(y_test_pred)

    return mean_valid_f1


In [142]:
best_score = 0.795

In [None]:
study_cv = optuna.create_study(direction='maximize')
study_cv.optimize(objective_cv, n_trials=50)

[32m[I 2022-02-21 10:26:27,069][0m A new study created in memory with name: no-name-c6379eb7-37d3-42d1-b645-cd605a0f3c5e[0m
5it [01:11, 14.34s/it]
[32m[I 2022-02-21 10:27:38,776][0m Trial 0 finished with value: 0.7875312104530361 and parameters: {'alpha': 1.245653185407123, 'hidden_layer_sizes': 219, 'mlp_validation_fraction': 0.04071282192517211, 'reduce_dim': False}. Best is trial 0 with value: 0.7875312104530361.[0m
5it [00:51, 10.23s/it]
[32m[I 2022-02-21 10:28:29,951][0m Trial 1 finished with value: 0.7818984547115877 and parameters: {'alpha': 8.622312541470208, 'hidden_layer_sizes': 139, 'mlp_validation_fraction': 0.18180375199538013, 'reduce_dim': False}. Best is trial 0 with value: 0.7875312104530361.[0m
5it [02:39, 31.86s/it]
[32m[I 2022-02-21 10:31:09,257][0m Trial 2 finished with value: 0.7864756068363976 and parameters: {'alpha': 6.396572349825316, 'hidden_layer_sizes': 518, 'mlp_validation_fraction': 0.011601681069209327, 'reduce_dim': False}. Best is trial 0 wi

SUBMISION, mean_valid_f1: 0.7995572669773718
https://app.neptune.ai/deepsense-ai/AIcrowd/e/AIC-210
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 43 operations to synchronize with Neptune. Do not kill this process.


All 43 operations synced, thanks for waiting!


<IPython.core.display.Javascript object>

Using notebook: sklearn_models.ipynb for submission...
Removing existing files from submission directory...
Scrubbing API keys from the notebook...
Collecting notebook...


Output()

In [336]:
classifier = SVC(
    C=20,
    degree=5,
    max_iter=-1,
#     break_ties=True,
#     tol=0.00001,
#     probability=True,
    gamma=0.0004,
    random_state=42,
).fit(X_train_scaled, y_train)

In [337]:
classifier.score(X_train_scaled, y_train)

0.972

In [338]:
classifier.score(X_valid_scaled, y_valid)

0.8065

### Random Forest

In [105]:
classifier = RandomForestClassifier(
    n_estimators = 200,
    max_depth = 50,
    min_samples_leaf = 20,
).fit(X_train_scaled, y_train)

In [106]:
classifier.score(X_train_scaled, y_train)

0.8712

In [107]:
classifier.score(X_valid_scaled, y_valid)

0.6815

In [117]:
predictions = []
for model in [classifier, classifier, classifier]:
    predictions.append(model.predict(X_valid_scaled))

    y_test_pred = [
        Counter([pred[i] for pred in predictions]).most_common(1)[0][0]
        for i in range(len(X_valid_scaled))
    ]

In [120]:
# classifier.score(X_valid_scaled, y_test_pred) 

### Neural Network

In [19]:
X_train_valid_scaled = np.concatenate([X_train_scaled, X_valid_scaled])
y_train_valid = np.concatenate([y_train, y_valid])

In [23]:
classifier = MLPClassifier(
    alpha = 0.5,
    hidden_layer_sizes = 455,
    early_stopping = True,
    n_iter_no_change = 100,
    max_iter = 1000,
    validation_fraction = 0.02,
    random_state=42,
#     verbose =  True,
# ).fit(X_train_scaled, y_train)
).fit(X_train_valid_scaled, y_train_valid)



In [21]:
classifier.score(X_train_scaled, y_train)

0.986

In [22]:
classifier.score(X_valid_scaled, y_valid)

0.992

## Validation

In [25]:
y_valid_pred = classifier.predict(X_valid_scaled)
print(classification_report(y_valid, y_valid_pred))

              precision    recall  f1-score   support

    negative       0.99      0.99      0.99       640
     neutral       0.99      0.99      0.99       633
    positive       1.00      1.00      1.00       727

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000



In [26]:
y_test_pred = classifier.predict(X_test_scaled)

In [27]:
submission = pd.DataFrame(
    {
        "embeddings": X_test_scaled.tolist(),
        "label": y_test_pred,
    }
)

## Submission

In [28]:
submission.to_csv(os.path.join("assets", "submission.csv"))

In [29]:
%aicrowd notebook submit -c sentiment-classification -a assets --no-verify

<IPython.core.display.Javascript object>

Using notebook: sklearn_models.ipynb for submission...
Removing existing files from submission directory...
Scrubbing API keys from the notebook...
Collecting notebook...


Output()

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

