import pandas as pd
# Load the pre-cleaned Amazon seller-forum scrape (one row per thread).
amazon_text_df = pd.read_csv('minClean_amazon.csv')
amazon_text_df.head()
Unnamed: 0 | Link | Title | Category | Post Author | Leading Comment | Publish Time | Reply Authors | Reply Comments | Reply Times | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | http://sellercentral.amazon.com/forums/t/2-box... | 2 boxes of books | Fulfillment By Amazon | Earth_Light_Books | I need to know how to start | 2020-05-26 17:33:26+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nI need to know how to start\n', ' \nAmazon... | [Timestamp('2020-05-26 17:48:57+0000', tz='UTC... |
1 | 1 | http://sellercentral.amazon.com/forums/t/fba-s... | FBA Shipment Delivered Delayed Check in | Fulfillment By Amazon | HOUSE_OF_LORDS_RODEO | Hi are there any other seller experiencing del... | 2020-05-26 15:28:17+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nHi are there any other seller experiencing... | [Timestamp('2020-05-26 16:03:50+0000', tz='UTC')] |
2 | 2 | http://sellercentral.amazon.com/forums/t/fba-r... | FBA removal/disposal fees | Fulfillment By Amazon | Texastoys | Hi all\nSo back when the new removaldisposal f... | 2020-05-26 05:43:46+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nHi all\nSo back when the new removaldispos... | [] |
3 | 3 | http://sellercentral.amazon.com/forums/t/am-i-... | Am i able to send in FBA inventory now? | Fulfillment By Amazon | Assuranceproducts | Im using inventorylab to upload my inventory t... | 2020-05-25 16:53:40+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nIm using inventorylab to upload my invento... | [Timestamp('2020-05-25 17:14:44+0000', tz='UTC... |
4 | 4 | http://sellercentral.amazon.com/forums/t/stran... | Stranded Inventory | Fulfillment By Amazon | BillsBuys | I recently listed 100 books for sale on Amazon... | 2020-05-24 02:14:21+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nI recently listed 100 books for sale on Am... | [Timestamp('2020-05-25 01:42:41+0000', tz='UTC... |
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
# Class balance: number of posts in each forum category.
plt.figure(figsize=(12,5))
sns.countplot(x=amazon_text_df.Category, color='green')
plt.title('Amazon text class distribution', fontsize=16)
plt.ylabel('Class Counts', fontsize=16)
plt.xlabel('Class Label', fontsize=16)
plt.xticks(rotation='vertical');
# Drop columns not used for modelling: the CSV index, URLs, timestamps,
# and 'Reply Authors' (which holds the repr of a function object — a
# scraping artifact, see the head() output above).
amazon_text_df.drop(columns= ['Unnamed: 0', 'Link', 'Publish Time', 'Reply Times', 'Reply Authors'], inplace=True)
df = amazon_text_df.copy()
# Fuse author, title, leading comment and replies into one text field.
df['text'] = df['Post Author'] + ' ' + df['Title'] + ' ' + df['Leading Comment'] + ' ' + df['Reply Comments']
from gensim import utils
import gensim.parsing.preprocessing as gsp
# gensim text-preprocessing pipeline; filters are applied in list order.
filters = [
gsp.strip_tags,  # remove HTML/XML tags
gsp.strip_punctuation,  # drop punctuation
gsp.strip_multiple_whitespaces,  # collapse whitespace runs
gsp.strip_numeric,  # drop digits
gsp.remove_stopwords,  # remove common English stopwords
gsp.strip_short,  # drop very short tokens (gensim default minsize=3 — confirm)
gsp.stem_text  # Porter-stem each token (e.g. 'fulfillment' -> 'fulfil')
]
def clean_text(s):
    """Normalize a raw forum string for modelling.

    Lowercases the text and applies the module-level gensim ``filters``
    in order (tag/punctuation/number/stopword removal, short-token
    stripping, stemming).

    Parameters
    ----------
    s : str
        Raw text (title, comment, concatenated thread text, ...).

    Returns
    -------
    str
        Cleaned, stemmed text, e.g. 'Fulfillment By Amazon' -> 'fulfil amazon'.
    """
    # Decode to unicode FIRST so .lower() and the gensim filters always
    # operate on a proper str; the original lowercased before converting,
    # which breaks if a bytes value slips through.
    s = utils.to_unicode(s)
    s = s.lower()
    for f in filters:
        s = f(s)
    return s
df.iloc[2,1]
'Fulfillment By Amazon'
clean_text(df.iloc[2,1])
'fulfil amazon'
%matplotlib inline
from wordcloud import WordCloud
def plot_word_cloud(text):
    """Render an 800x800 word cloud of *text* on a black background."""
    cloud = WordCloud(
        width=800,
        height=800,
        background_color='black',
        stopwords=None,
        min_font_size=10,
    ).generate(text)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# Build one corpus string from every post.  A single ' '.join is O(n);
# the original accumulated with '+=' in a loop, which is quadratic in
# the number of rows (7882 here).
texts = ' '.join(clean_text(item['text']) for _, item in df.iterrows())
plot_word_cloud(texts)
Most posts appear to be about selling and sellers; the long reply threads suggest users asking for, and receiving, advice.
# Standardize on a lowercase column name from here on.
df.rename(columns={"Category": "category"}, inplace=True)

def plot_word_cloud_for_category(df, category):
    """Plot a word cloud built only from posts of one forum category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'category' and 'text' columns.
    category : str
        Category label to filter on (exact match).
    """
    subset = df.loc[df['category'] == str(category)]
    # Join once instead of '+='-accumulating — the original loop was
    # quadratic in the number of matching rows.
    texts = ' '.join(clean_text(item['text']) for _, item in subset.iterrows())
    plot_word_cloud(texts)
df.category.unique()
array(['Fulfillment By Amazon', 'Selling on Amazon', 'Amazon Marketplace Web Service (MWS)', 'Amazon Sponsored Products', 'Account Health', 'Global Selling', 'Amazon Pay', 'Groups', 'Health,Safety,Sustainability,Security & Compliance', 'Login With Amazon', 'Amazon Custom'], dtype=object)
plot_word_cloud_for_category(df,'Fulfillment By Amazon')
Fulfillment By Amazon: order, shipment, time. This seems mostly about fulfillment experiences — either the time taken to receive the product or the state of the product itself.
plot_word_cloud_for_category(df,'Selling on Amazon')
Selling on Amazon: these posts appear to ask for help with selling on Amazon and about other people's experiences or results (sales).
plot_word_cloud_for_category(df,'Health,Safety,Sustainability,Security & Compliance')
Several words are common across the categories (as expected): product, amazon, issu, item, help. It would be interesting to investigate how many times they appear (and perhaps get rid of them and see whether accuracy improves).
# Model inputs: the combined text column; targets: the forum category.
df_x = df['text']
df_y = df['category']
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm
import multiprocessing
import numpy as np
class Doc2VecTransformer(BaseEstimator):
    """sklearn-compatible transformer: raw texts -> Doc2Vec document vectors.

    Parameters
    ----------
    vector_size : int
        Dimensionality of the document embeddings.
    learning_rate : float
        Initial learning rate (gensim ``alpha``).
    epochs : int
        Number of training passes over the corpus.
    """

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        # Leave one core free for the rest of the system.
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        """Train a Doc2Vec model on the cleaned documents in df_x."""
        tagged_x = [TaggedDocument(clean_text(row).split(), [index])
                    for index, row in enumerate(df_x)]
        # BUG FIX: the original passed documents= to the constructor (which
        # already trains once) and then re-called train() in a manual loop,
        # subtracting learning_rate (0.02) from alpha after every epoch.
        # With gensim's default alpha of 0.025, alpha went NEGATIVE after
        # the first iteration, corrupting training.  gensim's recommended
        # pattern is build_vocab + one train() call with the full epoch
        # count; alpha decay is handled internally between alpha and
        # min_alpha.
        model = Doc2Vec(vector_size=self.vector_size,
                        workers=self.workers,
                        alpha=self.learning_rate,
                        min_alpha=0.001)
        model.build_vocab(tagged_x)
        model.train(skl_utils.shuffle(tagged_x),
                    total_examples=len(tagged_x),
                    epochs=self.epochs)
        self._model = model
        return self

    def transform(self, df_x):
        """Infer one vector per document.

        Returns
        -------
        numpy.ndarray of shape (n_documents, vector_size).
        np.matrix/asmatrix is deprecated; a plain 2-D ndarray is accepted
        everywhere the matrix was (sklearn estimators included).
        """
        return np.array([self._model.infer_vector(clean_text(row).split())
                         for row in df_x])
# Train Doc2Vec on the whole corpus and embed every document.
doc2vec_trf = Doc2VecTransformer()
doc2vec_features = doc2vec_trf.fit(df_x).transform(df_x)
doc2vec_features
100%|██████████| 7882/7882 [00:00<00:00, 1566058.94it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1229069.23it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1261764.98it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1554132.39it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1631601.23it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1748440.03it/s] 100%|██████████| 7882/7882 [00:00<00:00, 2096885.97it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1651488.87it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1841139.68it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1581416.13it/s] 100%|██████████| 7882/7882 [00:00<00:00, 837712.96it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1828411.27it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1725173.73it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1696577.24it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1460999.83it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1979848.13it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1842884.45it/s] 100%|██████████| 7882/7882 [00:00<00:00, 2120558.31it/s] 100%|██████████| 7882/7882 [00:00<00:00, 2062866.85it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1675595.75it/s]
matrix([[-0.00277872, -0.00501187, 0.00482856, ..., -0.0008983 , 0.00663877, -0.00025027], [ 0.00234055, -0.00532782, 0.00668937, ..., -0.00276364, 0.00571931, 0.00571155], [ 0.00426091, -0.00314035, -0.0013386 , ..., 0.00154023, -0.00435641, 0.00448706], ..., [ 0.02304544, 0.1583538 , -0.05634788, ..., 0.05963844, -0.1528518 , -0.06593791], [-0.0044461 , -0.00192232, 0.00569911, ..., -0.00424831, 0.00223549, -0.0003378 ], [-0.00429653, -0.00084236, 0.00440887, ..., -0.00040945, 0.00469861, -0.00300767]], dtype=float32)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Baseline: Doc2Vec embeddings -> multinomial logistic regression,
# 5-fold cross-validated accuracy.
# NOTE(review): the logged ConvergenceWarnings show saga hit max_iter=100
# without converging — consider raising max_iter or scaling the features.
pl_log_reg = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
('log_reg', LogisticRegression(multi_class='multinomial', solver='saga', max_iter=100))])
scores = cross_val_score(pl_log_reg, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for Logistic Regression: ', scores.mean())
100%|██████████| 6303/6303 [00:00<00:00, 2303851.69it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2435664.10it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1995222.50it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2332718.44it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2670913.13it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2559215.69it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2473724.91it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2250889.58it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2697071.83it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2193006.89it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2534435.64it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1661744.81it/s] 100%|██████████| 6303/6303 [00:00<00:00, 3106180.02it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2819012.38it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2712290.77it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2482319.07it/s] 100%|██████████| 6303/6303 [00:00<00:00, 950105.95it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1756357.83it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1204624.90it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1379210.04it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6304/6304 [00:00<00:00, 2063598.88it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1545437.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1926617.05it/s] 100%|██████████| 6304/6304 [00:00<00:00, 979945.61it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1394929.70it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1944469.22it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1377919.25it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1333042.22it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1680173.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1746195.51it/s] 100%|██████████| 6304/6304 [00:00<00:00, 735674.92it/s] 
100%|██████████| 6304/6304 [00:00<00:00, 1623933.94it/s] 100%|██████████| 6304/6304 [00:00<00:00, 912163.81it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1613134.79it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1609403.64it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1676976.75it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1697758.60it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1627231.98it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1687465.21it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1688650.68it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6306/6306 [00:00<00:00, 2755133.44it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2753699.22it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2436598.90it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2510372.15it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2537588.12it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2697254.85it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2460398.23it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2649432.14it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2546873.47it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2494509.20it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2433908.26it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2469356.83it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2362808.74it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2369582.60it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2703872.52it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2408859.84it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2439520.48it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2706362.53it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2750835.26it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2532728.24it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The 
max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6307/6307 [00:00<00:00, 2441934.40it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2576302.62it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1819233.57it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2049544.85it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2470440.36it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2581834.41it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2653573.61it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2815397.54it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2791923.52it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2452120.44it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2632186.60it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2724353.79it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2769997.42it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2847521.56it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1996488.70it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1926410.96it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2679375.60it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2342674.05it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2764786.30it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2717357.51it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6308/6308 [00:00<00:00, 2287142.95it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2558273.99it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2702795.96it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2530381.56it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2687695.01it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2577715.28it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2559016.31it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2705006.61it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2395876.99it/s] 
100%|██████████| 6308/6308 [00:00<00:00, 2345746.04it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2514270.61it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2808371.68it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2176690.22it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2333951.10it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2744002.24it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2083773.30it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2416887.70it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2383144.45it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2520258.11it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2826372.14it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning)
Accuracy for Logistic Regression: 0.2959826923098424
from sklearn.ensemble import RandomForestClassifier
# Doc2Vec embeddings -> random forest, 5-fold CV accuracy.
# NOTE(review): the logged FutureWarnings show the old sklearn default of
# n_estimators=10 (pre-0.22); pass n_estimators explicitly for
# reproducibility across versions.
pl_random_forest = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
('random_forest', RandomForestClassifier())])
scores = cross_val_score(pl_random_forest, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for RandomForest : ', scores.mean())
100%|██████████| 6303/6303 [00:00<00:00, 2181785.76it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1786987.84it/s] 100%|██████████| 6303/6303 [00:00<00:00, 3061220.25it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2021463.38it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2512038.97it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1069921.81it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1687628.35it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2131476.10it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2452383.87it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2688022.18it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2330867.41it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2524271.76it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2595905.16it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2381040.99it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2127017.31it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2115782.16it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1683115.69it/s] 100%|██████████| 6303/6303 [00:00<00:00, 998402.44it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2239069.88it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2495205.11it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6304/6304 [00:00<00:00, 2086725.00it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2829415.99it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2718298.80it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2130611.80it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2342597.01it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2012704.00it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1431403.88it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2404154.61it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1747349.49it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2086066.46it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2668910.11it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2262032.03it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2101318.64it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1781371.18it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2479918.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2867775.75it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2434928.85it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1869935.81it/s] 100%|██████████| 6304/6304 [00:00<00:00, 850517.64it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1454794.63it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6306/6306 [00:00<00:00, 2514429.23it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1850894.40it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2555980.00it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2049378.66it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2398375.14it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2603531.94it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2589766.08it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2672184.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2459711.80it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1798047.66it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2046682.74it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2859690.89it/s] 100%|██████████| 6306/6306 [00:00<00:00, 616590.85it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2104828.98it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2378746.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2941096.52it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2988619.32it/s] 100%|██████████| 6306/6306 [00:00<00:00, 3075855.45it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2299737.50it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2222815.45it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6307/6307 [00:00<00:00, 2452120.44it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2297904.39it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2280078.89it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2786923.23it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2462621.05it/s] 100%|██████████| 6307/6307 [00:00<00:00, 3022218.13it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2819598.73it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2203721.70it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2513633.16it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2066516.31it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2216276.42it/s] 100%|██████████| 6307/6307 [00:00<00:00, 773267.33it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1731361.69it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2344750.52it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2163529.51it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1942965.50it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2569046.84it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1970904.14it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2047799.61it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2839574.42it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6308/6308 [00:00<00:00, 2165466.49it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2567210.33it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2405898.85it/s] 100%|██████████| 6308/6308 [00:00<00:00, 448427.48it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2179559.24it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2166175.67it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2552597.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1807958.84it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2014747.92it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2218672.51it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2570452.70it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2478470.22it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2438270.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2429091.96it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1497321.43it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2001185.21it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1722504.53it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1561016.56it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1046957.76it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1659932.85it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Accuracy for RandomForest : 0.22430165617671932
import xgboost as xgb
# Doc2Vec embeddings -> XGBoost with the multi-class softmax objective.
pl_xgb = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
('xgboost', xgb.XGBClassifier(objective='multi:softmax'))])
# No scoring= given: cross_val_score falls back to the estimator's
# default scorer (accuracy for classifiers).
scores = cross_val_score(pl_xgb, df_x, df_y, cv=5)
print('Accuracy for XGBoost Classifier : ', scores.mean())
100%|██████████| 6303/6303 [00:00<00:00, 1376481.21it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2049197.59it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2407494.59it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2462205.28it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1852736.57it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1289720.86it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1274610.58it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1028385.19it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2179447.49it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1854165.95it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2828967.16it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1495372.93it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1299611.55it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1319986.92it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1619002.89it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1973919.07it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2372280.88it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2448976.20it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2348467.45it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1592092.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2327748.25it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2446418.62it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2089528.40it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2446192.29it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2164093.34it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2529744.78it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2528777.01it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2188272.15it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2415134.49it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2784424.22it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2688995.47it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2676745.54it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2378633.72it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2239425.12it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2731215.00it/s] 
100%|██████████| 6304/6304 [00:00<00:00, 2412490.18it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2733756.45it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2445287.38it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2533866.07it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2368194.57it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2300737.74it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1965028.31it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2264299.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2304948.24it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2221881.81it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2577400.22it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2625238.81it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1872382.91it/s] 100%|██████████| 6306/6306 [00:00<00:00, 949568.50it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1060006.45it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1798659.03it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1282949.22it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2499459.56it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2019337.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1901731.45it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2328691.76it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2341680.48it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2450595.85it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1058945.47it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2042100.14it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2591191.63it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2837138.07it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2846296.03it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2494904.77it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2105162.77it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1873210.26it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1499885.20it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2179213.72it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2828341.21it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2848441.40it/s] 
100%|██████████| 6307/6307 [00:00<00:00, 2537990.53it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2781063.43it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2897106.05it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2076085.02it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2133688.93it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1772664.70it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1661441.74it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1899165.43it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1988983.11it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2798125.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2586282.47it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2802422.37it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2459119.77it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2433784.35it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2290707.33it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1647426.50it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1721271.85it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2463011.51it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2183336.33it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1095646.42it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1416515.13it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1971951.23it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2488494.13it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2495300.35it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2205540.98it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2327585.96it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2361870.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2593634.90it/s] 100%|██████████| 6308/6308 [00:00<00:00, 601953.67it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2506173.12it/s]
Accuracy for XGBoost Classifier : 0.26959275571562236
from sklearn.feature_extraction.text import TfidfVectorizer
class Text2TfIdfTransformer(BaseEstimator):
    """sklearn-compatible TF-IDF transformer that applies clean_text
    identically at fit and transform time.
    """

    def __init__(self):
        self._model = TfidfVectorizer()

    def fit(self, df_x, df_y=None):
        """Learn the TF-IDF vocabulary from the cleaned documents."""
        self._model.fit(df_x.apply(clean_text))
        return self

    def transform(self, df_x):
        """Vectorize df_x into a sparse TF-IDF matrix.

        BUG FIX: the original cleaned the text in fit() but NOT in
        transform(), so the vocabulary was learned over stemmed tokens
        (e.g. 'fulfil') while transform looked up raw tokens
        ('fulfillment') — most features came out zero.  Preprocessing
        must be applied identically in both phases.
        """
        return self._model.transform(df_x.apply(clean_text))
# Fit the TF-IDF vocabulary on the full corpus and vectorize it.
tfidf_transformer = Text2TfIdfTransformer()
tfidf_vectors = tfidf_transformer.fit(df_x).transform(df_x)
# (n_documents, vocabulary_size) — sparse matrix.
tfidf_vectors.shape
(7882, 43292)
print(tfidf_vectors)
(0, 42852) 0.10716138942692059 (0, 42833) 0.16291259128557395 (0, 42365) 0.1374588226926075 (0, 42353) 0.06082974395699173 (0, 42348) 0.044435190334657586 (0, 42294) 0.042336825128041385 (0, 42287) 0.011432118887967885 (0, 42172) 0.15682012940168905 (0, 42078) 0.04504563919250176 (0, 41932) 0.08467365025608277 (0, 41913) 0.036371471933238524 (0, 41874) 0.020424454389634677 (0, 40770) 0.026833295365657914 (0, 40575) 0.020288743969423657 (0, 40373) 0.044435190334657586 (0, 39668) 0.04592400414213082 (0, 39436) 0.04153906370133075 (0, 39105) 0.5237110079112914 (0, 39013) 0.04802236934874703 (0, 38906) 0.019217029254805996 (0, 38810) 0.039199947859783306 (0, 38712) 0.028741140700716133 (0, 38670) 0.01968054574204271 (0, 38578) 0.02912946319827759 (0, 38528) 0.12839240829816084 : : (7881, 29239) 0.21975827873827963 (7881, 29192) 0.11730330184873854 (7881, 29084) 0.11653154836178616 (7881, 29059) 0.03703631468681082 (7881, 29030) 0.1366225104087197 (7881, 26017) 0.09746878334638821 (7881, 19589) 0.16563806512953597 (7881, 17646) 0.12027706336883416 (7881, 16992) 0.044599056662615315 (7881, 16902) 0.2294546248991148 (7881, 15753) 0.20305728782722235 (7881, 12849) 0.11307748159074872 (7881, 12511) 0.08503480180054757 (7881, 11150) 0.14796165836492717 (7881, 8486) 0.09694092529727945 (7881, 8460) 0.35801950129865184 (7881, 5599) 0.14999419592850963 (7881, 5306) 0.09732848262334262 (7881, 2078) 0.13557948534848502 (7881, 1783) 0.11730330184873854 (7881, 1764) 0.16563806512953597 (7881, 1556) 0.1399580860168509 (7881, 1174) 0.0352630441133428 (7881, 1131) 0.13107275948944294 (7881, 1099) 0.13889469408995614
# TF-IDF features -> multinomial logistic regression, 5-fold CV accuracy.
# NOTE(review): saga may need max_iter > 100 on sparse TF-IDF — watch for
# ConvergenceWarning.
pl_log_reg_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
('log_reg', LogisticRegression(multi_class='multinomial', solver='saga', max_iter=100))])
scores = cross_val_score(pl_log_reg_tf_idf, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for Tf-Idf & Logistic Regression: ', scores.mean())
Accuracy for Tf-Idf & Logistic Regression: 0.6136588119830015
# TF-IDF features -> random forest, 5-fold CV accuracy.
# NOTE(review): as above, the old sklearn default n_estimators=10 is in
# play (see logged FutureWarnings); set it explicitly for reproducibility.
pl_random_forest_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
('random_forest', RandomForestClassifier())])
scores = cross_val_score(pl_random_forest_tf_idf, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for Tf-Idf & RandomForest : ', scores.mean())
/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Accuracy for Tf-Idf & RandomForest : 0.48855797090863673
# TF-IDF features -> XGBoost (multi-class softmax), 5-fold CV with the
# classifier's default scorer (accuracy).
pl_xgb_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
('xgboost', xgb.XGBClassifier(objective='multi:softmax'))])
scores = cross_val_score(pl_xgb_tf_idf, df_x, df_y, cv=5)
print('Accuracy for Tf-Idf & XGBoost Classifier : ', scores.mean())