import pandas as pd
# Load the pre-cleaned Amazon seller-forum scrape (one row per thread).
amazon_text_df = pd.read_csv('minClean_amazon.csv')
amazon_text_df.head()
Unnamed: 0 | Link | Title | Category | Post Author | Leading Comment | Publish Time | Reply Authors | Reply Comments | Reply Times | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | http://sellercentral.amazon.com/forums/t/2-box... | 2 boxes of books | Fulfillment By Amazon | Earth_Light_Books | I need to know how to start | 2020-05-26 17:33:26+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nI need to know how to start\n', ' \nAmazon... | [Timestamp('2020-05-26 17:48:57+0000', tz='UTC... |
1 | 1 | http://sellercentral.amazon.com/forums/t/fba-s... | FBA Shipment Delivered Delayed Check in | Fulfillment By Amazon | HOUSE_OF_LORDS_RODEO | Hi are there any other seller experiencing del... | 2020-05-26 15:28:17+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nHi are there any other seller experiencing... | [Timestamp('2020-05-26 16:03:50+0000', tz='UTC')] |
2 | 2 | http://sellercentral.amazon.com/forums/t/fba-r... | FBA removal/disposal fees | Fulfillment By Amazon | Texastoys | Hi all\nSo back when the new removaldisposal f... | 2020-05-26 05:43:46+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nHi all\nSo back when the new removaldispos... | [] |
3 | 3 | http://sellercentral.amazon.com/forums/t/am-i-... | Am i able to send in FBA inventory now? | Fulfillment By Amazon | Assuranceproducts | Im using inventorylab to upload my inventory t... | 2020-05-25 16:53:40+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nIm using inventorylab to upload my invento... | [Timestamp('2020-05-25 17:14:44+0000', tz='UTC... |
4 | 4 | http://sellercentral.amazon.com/forums/t/stran... | Stranded Inventory | Fulfillment By Amazon | BillsBuys | I recently listed 100 books for sale on Amazon... | 2020-05-24 02:14:21+00:00 | <function get_reply_authors at 0x00000119FBB60... | ['\nI recently listed 100 books for sale on Am... | [Timestamp('2020-05-25 01:42:41+0000', tz='UTC... |
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
# Class balance: number of posts in each forum category.
plt.figure(figsize=(12,5))
sns.countplot(x=amazon_text_df.Category, color='green')
plt.title('Amazon text class distribution', fontsize=16)
plt.ylabel('Class Counts', fontsize=16)
plt.xlabel('Class Label', fontsize=16)
plt.xticks(rotation='vertical');
# Drop columns not used for modelling: the CSV index, URLs, timestamps,
# and 'Reply Authors' (which holds the repr of a function object — a
# scraping artifact, see the head() output above).
amazon_text_df.drop(columns= ['Unnamed: 0', 'Link', 'Publish Time', 'Reply Times', 'Reply Authors'], inplace=True)
df = amazon_text_df.copy()
# Fuse author, title, leading comment and replies into one text field.
df['text'] = df['Post Author'] + ' ' + df['Title'] + ' ' + df['Leading Comment'] + ' ' + df['Reply Comments']
from gensim import utils
import gensim.parsing.preprocessing as gsp
# gensim text-preprocessing pipeline; filters are applied in list order.
filters = [
gsp.strip_tags,  # remove HTML/XML tags
gsp.strip_punctuation,  # drop punctuation
gsp.strip_multiple_whitespaces,  # collapse whitespace runs
gsp.strip_numeric,  # drop digits
gsp.remove_stopwords,  # remove common English stopwords
gsp.strip_short,  # drop very short tokens (gensim default minsize=3 — confirm)
gsp.stem_text  # Porter-stem each token (e.g. 'fulfillment' -> 'fulfil')
]
def clean_text(s):
    """Normalize a raw forum string for modelling.

    Lowercases the text and applies the module-level gensim ``filters``
    in order (tag/punctuation/number/stopword removal, short-token
    stripping, stemming).

    Parameters
    ----------
    s : str
        Raw text (title, comment, concatenated thread text, ...).

    Returns
    -------
    str
        Cleaned, stemmed text, e.g. 'Fulfillment By Amazon' -> 'fulfil amazon'.
    """
    # Decode to unicode FIRST so .lower() and the gensim filters always
    # operate on a proper str; the original lowercased before converting,
    # which breaks if a bytes value slips through.
    s = utils.to_unicode(s)
    s = s.lower()
    for f in filters:
        s = f(s)
    return s
df.iloc[2,1]
'Fulfillment By Amazon'
clean_text(df.iloc[2,1])
'fulfil amazon'
%matplotlib inline
from wordcloud import WordCloud
def plot_word_cloud(text):
    """Render an 800x800 word cloud of *text* on a black background."""
    cloud = WordCloud(
        width=800,
        height=800,
        background_color='black',
        stopwords=None,
        min_font_size=10,
    ).generate(text)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
# Build one corpus string from every post.  A single ' '.join is O(n);
# the original accumulated with '+=' in a loop, which is quadratic in
# the number of rows (7882 here).
texts = ' '.join(clean_text(item['text']) for _, item in df.iterrows())
plot_word_cloud(texts)
Most posts appear to be about selling and sellers; the long reply threads suggest users asking for, and receiving, advice.
# Standardize on a lowercase column name from here on.
df.rename(columns={"Category": "category"}, inplace=True)

def plot_word_cloud_for_category(df, category):
    """Plot a word cloud built only from posts of one forum category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'category' and 'text' columns.
    category : str
        Category label to filter on (exact match).
    """
    subset = df.loc[df['category'] == str(category)]
    # Join once instead of '+='-accumulating — the original loop was
    # quadratic in the number of matching rows.
    texts = ' '.join(clean_text(item['text']) for _, item in subset.iterrows())
    plot_word_cloud(texts)
df.category.unique()
array(['Fulfillment By Amazon', 'Selling on Amazon', 'Amazon Marketplace Web Service (MWS)', 'Amazon Sponsored Products', 'Account Health', 'Global Selling', 'Amazon Pay', 'Groups', 'Health,Safety,Sustainability,Security & Compliance', 'Login With Amazon', 'Amazon Custom'], dtype=object)
plot_word_cloud_for_category(df,'Fulfillment By Amazon')
Fulfillment By Amazon: order, shipment, time. This seems mostly about fulfillment experiences — either the time taken to receive the product or the state of the product itself.
plot_word_cloud_for_category(df,'Selling on Amazon')
Selling on Amazon: these posts appear to ask for help with selling on Amazon and about other people's experiences or results (sales).
plot_word_cloud_for_category(df,'Health,Safety,Sustainability,Security & Compliance')
Several words are common across the categories (as expected): product, amazon, issu, item, help. It would be interesting to investigate how many times they appear (and perhaps get rid of them and see whether accuracy improves).
# Model inputs: the combined text column; targets: the forum category.
df_x = df['text']
df_y = df['category']
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm
import multiprocessing
import numpy as np
class Doc2VecTransformer(BaseEstimator):
    """sklearn-compatible transformer: raw texts -> Doc2Vec document vectors.

    Parameters
    ----------
    vector_size : int
        Dimensionality of the document embeddings.
    learning_rate : float
        Initial learning rate (gensim ``alpha``).
    epochs : int
        Number of training passes over the corpus.
    """

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        # Leave one core free for the rest of the system.
        self.workers = multiprocessing.cpu_count() - 1

    def fit(self, df_x, df_y=None):
        """Train a Doc2Vec model on the cleaned documents in df_x."""
        tagged_x = [TaggedDocument(clean_text(row).split(), [index])
                    for index, row in enumerate(df_x)]
        # BUG FIX: the original passed documents= to the constructor (which
        # already trains once) and then re-called train() in a manual loop,
        # subtracting learning_rate (0.02) from alpha after every epoch.
        # With gensim's default alpha of 0.025, alpha went NEGATIVE after
        # the first iteration, corrupting training.  gensim's recommended
        # pattern is build_vocab + one train() call with the full epoch
        # count; alpha decay is handled internally between alpha and
        # min_alpha.
        model = Doc2Vec(vector_size=self.vector_size,
                        workers=self.workers,
                        alpha=self.learning_rate,
                        min_alpha=0.001)
        model.build_vocab(tagged_x)
        model.train(skl_utils.shuffle(tagged_x),
                    total_examples=len(tagged_x),
                    epochs=self.epochs)
        self._model = model
        return self

    def transform(self, df_x):
        """Infer one vector per document.

        Returns
        -------
        numpy.ndarray of shape (n_documents, vector_size).
        np.matrix/asmatrix is deprecated; a plain 2-D ndarray is accepted
        everywhere the matrix was (sklearn estimators included).
        """
        return np.array([self._model.infer_vector(clean_text(row).split())
                         for row in df_x])
# Train Doc2Vec on the whole corpus and embed every document.
doc2vec_trf = Doc2VecTransformer()
doc2vec_features = doc2vec_trf.fit(df_x).transform(df_x)
doc2vec_features
100%|██████████| 7882/7882 [00:00<00:00, 1566058.94it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1229069.23it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1261764.98it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1554132.39it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1631601.23it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1748440.03it/s] 100%|██████████| 7882/7882 [00:00<00:00, 2096885.97it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1651488.87it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1841139.68it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1581416.13it/s] 100%|██████████| 7882/7882 [00:00<00:00, 837712.96it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1828411.27it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1725173.73it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1696577.24it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1460999.83it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1979848.13it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1842884.45it/s] 100%|██████████| 7882/7882 [00:00<00:00, 2120558.31it/s] 100%|██████████| 7882/7882 [00:00<00:00, 2062866.85it/s] 100%|██████████| 7882/7882 [00:00<00:00, 1675595.75it/s]
matrix([[-0.00277872, -0.00501187, 0.00482856, ..., -0.0008983 , 0.00663877, -0.00025027], [ 0.00234055, -0.00532782, 0.00668937, ..., -0.00276364, 0.00571931, 0.00571155], [ 0.00426091, -0.00314035, -0.0013386 , ..., 0.00154023, -0.00435641, 0.00448706], ..., [ 0.02304544, 0.1583538 , -0.05634788, ..., 0.05963844, -0.1528518 , -0.06593791], [-0.0044461 , -0.00192232, 0.00569911, ..., -0.00424831, 0.00223549, -0.0003378 ], [-0.00429653, -0.00084236, 0.00440887, ..., -0.00040945, 0.00469861, -0.00300767]], dtype=float32)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Baseline: Doc2Vec embeddings -> multinomial logistic regression,
# 5-fold cross-validated accuracy.
# NOTE(review): the logged ConvergenceWarnings show saga hit max_iter=100
# without converging — consider raising max_iter or scaling the features.
pl_log_reg = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
('log_reg', LogisticRegression(multi_class='multinomial', solver='saga', max_iter=100))])
scores = cross_val_score(pl_log_reg, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for Logistic Regression: ', scores.mean())
100%|██████████| 6303/6303 [00:00<00:00, 2303851.69it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2435664.10it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1995222.50it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2332718.44it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2670913.13it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2559215.69it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2473724.91it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2250889.58it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2697071.83it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2193006.89it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2534435.64it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1661744.81it/s] 100%|██████████| 6303/6303 [00:00<00:00, 3106180.02it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2819012.38it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2712290.77it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2482319.07it/s] 100%|██████████| 6303/6303 [00:00<00:00, 950105.95it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1756357.83it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1204624.90it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1379210.04it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6304/6304 [00:00<00:00, 2063598.88it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1545437.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1926617.05it/s] 100%|██████████| 6304/6304 [00:00<00:00, 979945.61it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1394929.70it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1944469.22it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1377919.25it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1333042.22it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1680173.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1746195.51it/s] 100%|██████████| 6304/6304 [00:00<00:00, 735674.92it/s] 
100%|██████████| 6304/6304 [00:00<00:00, 1623933.94it/s] 100%|██████████| 6304/6304 [00:00<00:00, 912163.81it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1613134.79it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1609403.64it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1676976.75it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1697758.60it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1627231.98it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1687465.21it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1688650.68it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6306/6306 [00:00<00:00, 2755133.44it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2753699.22it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2436598.90it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2510372.15it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2537588.12it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2697254.85it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2460398.23it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2649432.14it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2546873.47it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2494509.20it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2433908.26it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2469356.83it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2362808.74it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2369582.60it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2703872.52it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2408859.84it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2439520.48it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2706362.53it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2750835.26it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2532728.24it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The 
max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6307/6307 [00:00<00:00, 2441934.40it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2576302.62it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1819233.57it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2049544.85it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2470440.36it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2581834.41it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2653573.61it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2815397.54it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2791923.52it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2452120.44it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2632186.60it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2724353.79it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2769997.42it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2847521.56it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1996488.70it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1926410.96it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2679375.60it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2342674.05it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2764786.30it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2717357.51it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning) 100%|██████████| 6308/6308 [00:00<00:00, 2287142.95it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2558273.99it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2702795.96it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2530381.56it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2687695.01it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2577715.28it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2559016.31it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2705006.61it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2395876.99it/s] 
100%|██████████| 6308/6308 [00:00<00:00, 2345746.04it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2514270.61it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2808371.68it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2176690.22it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2333951.10it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2744002.24it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2083773.30it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2416887.70it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2383144.45it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2520258.11it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2826372.14it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/sag.py:337: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning)
Accuracy for Logistic Regression: 0.2959826923098424
from sklearn.ensemble import RandomForestClassifier
# Doc2Vec embeddings -> random forest, 5-fold CV accuracy.
# NOTE(review): the logged FutureWarnings show the old sklearn default of
# n_estimators=10 (pre-0.22); pass n_estimators explicitly for
# reproducibility across versions.
pl_random_forest = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
('random_forest', RandomForestClassifier())])
scores = cross_val_score(pl_random_forest, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for RandomForest : ', scores.mean())
100%|██████████| 6303/6303 [00:00<00:00, 2181785.76it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1786987.84it/s] 100%|██████████| 6303/6303 [00:00<00:00, 3061220.25it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2021463.38it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2512038.97it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1069921.81it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1687628.35it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2131476.10it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2452383.87it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2688022.18it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2330867.41it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2524271.76it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2595905.16it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2381040.99it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2127017.31it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2115782.16it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1683115.69it/s] 100%|██████████| 6303/6303 [00:00<00:00, 998402.44it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2239069.88it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2495205.11it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6304/6304 [00:00<00:00, 2086725.00it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2829415.99it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2718298.80it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2130611.80it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2342597.01it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2012704.00it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1431403.88it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2404154.61it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1747349.49it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2086066.46it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2668910.11it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2262032.03it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2101318.64it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1781371.18it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2479918.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2867775.75it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2434928.85it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1869935.81it/s] 100%|██████████| 6304/6304 [00:00<00:00, 850517.64it/s] 100%|██████████| 6304/6304 [00:00<00:00, 1454794.63it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6306/6306 [00:00<00:00, 2514429.23it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1850894.40it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2555980.00it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2049378.66it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2398375.14it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2603531.94it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2589766.08it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2672184.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2459711.80it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1798047.66it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2046682.74it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2859690.89it/s] 100%|██████████| 6306/6306 [00:00<00:00, 616590.85it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2104828.98it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2378746.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2941096.52it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2988619.32it/s] 100%|██████████| 6306/6306 [00:00<00:00, 3075855.45it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2299737.50it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2222815.45it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6307/6307 [00:00<00:00, 2452120.44it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2297904.39it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2280078.89it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2786923.23it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2462621.05it/s] 100%|██████████| 6307/6307 [00:00<00:00, 3022218.13it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2819598.73it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2203721.70it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2513633.16it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2066516.31it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2216276.42it/s] 100%|██████████| 6307/6307 [00:00<00:00, 773267.33it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1731361.69it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2344750.52it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2163529.51it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1942965.50it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2569046.84it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1970904.14it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2047799.61it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2839574.42it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. 
"10 in version 0.20 to 100 in 0.22.", FutureWarning) 100%|██████████| 6308/6308 [00:00<00:00, 2165466.49it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2567210.33it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2405898.85it/s] 100%|██████████| 6308/6308 [00:00<00:00, 448427.48it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2179559.24it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2166175.67it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2552597.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1807958.84it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2014747.92it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2218672.51it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2570452.70it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2478470.22it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2438270.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2429091.96it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1497321.43it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2001185.21it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1722504.53it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1561016.56it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1046957.76it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1659932.85it/s] /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Accuracy for RandomForest : 0.22430165617671932
import xgboost as xgb
# Doc2Vec embeddings -> XGBoost with the multi-class softmax objective.
pl_xgb = Pipeline(steps=[('doc2vec',Doc2VecTransformer()),
('xgboost', xgb.XGBClassifier(objective='multi:softmax'))])
# No scoring= given: cross_val_score falls back to the estimator's
# default scorer (accuracy for classifiers).
scores = cross_val_score(pl_xgb, df_x, df_y, cv=5)
print('Accuracy for XGBoost Classifier : ', scores.mean())
100%|██████████| 6303/6303 [00:00<00:00, 1376481.21it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2049197.59it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2407494.59it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2462205.28it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1852736.57it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1289720.86it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1274610.58it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1028385.19it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2179447.49it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1854165.95it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2828967.16it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1495372.93it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1299611.55it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1319986.92it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1619002.89it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1973919.07it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2372280.88it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2448976.20it/s] 100%|██████████| 6303/6303 [00:00<00:00, 2348467.45it/s] 100%|██████████| 6303/6303 [00:00<00:00, 1592092.63it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2327748.25it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2446418.62it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2089528.40it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2446192.29it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2164093.34it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2529744.78it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2528777.01it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2188272.15it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2415134.49it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2784424.22it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2688995.47it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2676745.54it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2378633.72it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2239425.12it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2731215.00it/s] 
100%|██████████| 6304/6304 [00:00<00:00, 2412490.18it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2733756.45it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2445287.38it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2533866.07it/s] 100%|██████████| 6304/6304 [00:00<00:00, 2368194.57it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2300737.74it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1965028.31it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2264299.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2304948.24it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2221881.81it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2577400.22it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2625238.81it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1872382.91it/s] 100%|██████████| 6306/6306 [00:00<00:00, 949568.50it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1060006.45it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1798659.03it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1282949.22it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2499459.56it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2019337.38it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1901731.45it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2328691.76it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2341680.48it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2450595.85it/s] 100%|██████████| 6306/6306 [00:00<00:00, 1058945.47it/s] 100%|██████████| 6306/6306 [00:00<00:00, 2042100.14it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2591191.63it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2837138.07it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2846296.03it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2494904.77it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2105162.77it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1873210.26it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1499885.20it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2179213.72it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2828341.21it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2848441.40it/s] 
100%|██████████| 6307/6307 [00:00<00:00, 2537990.53it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2781063.43it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2897106.05it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2076085.02it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2133688.93it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1772664.70it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1661441.74it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1899165.43it/s] 100%|██████████| 6307/6307 [00:00<00:00, 1988983.11it/s] 100%|██████████| 6307/6307 [00:00<00:00, 2798125.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2586282.47it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2802422.37it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2459119.77it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2433784.35it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2290707.33it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1647426.50it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1721271.85it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2463011.51it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2183336.33it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1095646.42it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1416515.13it/s] 100%|██████████| 6308/6308 [00:00<00:00, 1971951.23it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2488494.13it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2495300.35it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2205540.98it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2327585.96it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2361870.17it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2593634.90it/s] 100%|██████████| 6308/6308 [00:00<00:00, 601953.67it/s] 100%|██████████| 6308/6308 [00:00<00:00, 2506173.12it/s]
Accuracy for XGBoost Classifier : 0.26959275571562236
from sklearn.feature_extraction.text import TfidfVectorizer
class Text2TfIdfTransformer(BaseEstimator):
    """sklearn-compatible TF-IDF transformer that applies clean_text
    identically at fit and transform time.
    """

    def __init__(self):
        self._model = TfidfVectorizer()

    def fit(self, df_x, df_y=None):
        """Learn the TF-IDF vocabulary from the cleaned documents."""
        self._model.fit(df_x.apply(clean_text))
        return self

    def transform(self, df_x):
        """Vectorize df_x into a sparse TF-IDF matrix.

        BUG FIX: the original cleaned the text in fit() but NOT in
        transform(), so the vocabulary was learned over stemmed tokens
        (e.g. 'fulfil') while transform looked up raw tokens
        ('fulfillment') — most features came out zero.  Preprocessing
        must be applied identically in both phases.
        """
        return self._model.transform(df_x.apply(clean_text))
# Fit the TF-IDF vocabulary on the full corpus and vectorize it.
tfidf_transformer = Text2TfIdfTransformer()
tfidf_vectors = tfidf_transformer.fit(df_x).transform(df_x)
# (n_documents, vocabulary_size) — sparse matrix.
tfidf_vectors.shape
(7882, 43292)
print(tfidf_vectors)
(0, 42852) 0.10716138942692059 (0, 42833) 0.16291259128557395 (0, 42365) 0.1374588226926075 (0, 42353) 0.06082974395699173 (0, 42348) 0.044435190334657586 (0, 42294) 0.042336825128041385 (0, 42287) 0.011432118887967885 (0, 42172) 0.15682012940168905 (0, 42078) 0.04504563919250176 (0, 41932) 0.08467365025608277 (0, 41913) 0.036371471933238524 (0, 41874) 0.020424454389634677 (0, 40770) 0.026833295365657914 (0, 40575) 0.020288743969423657 (0, 40373) 0.044435190334657586 (0, 39668) 0.04592400414213082 (0, 39436) 0.04153906370133075 (0, 39105) 0.5237110079112914 (0, 39013) 0.04802236934874703 (0, 38906) 0.019217029254805996 (0, 38810) 0.039199947859783306 (0, 38712) 0.028741140700716133 (0, 38670) 0.01968054574204271 (0, 38578) 0.02912946319827759 (0, 38528) 0.12839240829816084 : : (7881, 29239) 0.21975827873827963 (7881, 29192) 0.11730330184873854 (7881, 29084) 0.11653154836178616 (7881, 29059) 0.03703631468681082 (7881, 29030) 0.1366225104087197 (7881, 26017) 0.09746878334638821 (7881, 19589) 0.16563806512953597 (7881, 17646) 0.12027706336883416 (7881, 16992) 0.044599056662615315 (7881, 16902) 0.2294546248991148 (7881, 15753) 0.20305728782722235 (7881, 12849) 0.11307748159074872 (7881, 12511) 0.08503480180054757 (7881, 11150) 0.14796165836492717 (7881, 8486) 0.09694092529727945 (7881, 8460) 0.35801950129865184 (7881, 5599) 0.14999419592850963 (7881, 5306) 0.09732848262334262 (7881, 2078) 0.13557948534848502 (7881, 1783) 0.11730330184873854 (7881, 1764) 0.16563806512953597 (7881, 1556) 0.1399580860168509 (7881, 1174) 0.0352630441133428 (7881, 1131) 0.13107275948944294 (7881, 1099) 0.13889469408995614
# TF-IDF features -> multinomial logistic regression, 5-fold CV accuracy.
# NOTE(review): saga may need max_iter > 100 on sparse TF-IDF — watch for
# ConvergenceWarning.
pl_log_reg_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
('log_reg', LogisticRegression(multi_class='multinomial', solver='saga', max_iter=100))])
scores = cross_val_score(pl_log_reg_tf_idf, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for Tf-Idf & Logistic Regression: ', scores.mean())
Accuracy for Tf-Idf & Logistic Regression: 0.6136588119830015
# TF-IDF features -> random forest, 5-fold CV accuracy.
# NOTE(review): as above, the old sklearn default n_estimators=10 is in
# play (see logged FutureWarnings); set it explicitly for reproducibility.
pl_random_forest_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
('random_forest', RandomForestClassifier())])
scores = cross_val_score(pl_random_forest_tf_idf, df_x, df_y, cv=5,scoring='accuracy')
print('Accuracy for Tf-Idf & RandomForest : ', scores.mean())
/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Accuracy for Tf-Idf & RandomForest : 0.48855797090863673
# TF-IDF features -> XGBoost (multi-class softmax), 5-fold CV with the
# classifier's default scorer (accuracy).
pl_xgb_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
('xgboost', xgb.XGBClassifier(objective='multi:softmax'))])
scores = cross_val_score(pl_xgb_tf_idf, df_x, df_y, cv=5)
print('Accuracy for Tf-Idf & XGBoost Classifier : ', scores.mean())