import pandas as pd

data = pd.read_csv('minClean_amazon.csv')
data.head()


data['Leading Comment'][2]

'Hi all\nSo back when the new removaldisposal fees were announced I assumed it was a typo that it costs the same to have Amazon “dispose” aka sell to liquidators while charging us or recall the item but now it says it in the help files Has anyone tried a disposal and gotten charged the same as a return Just seems crazy to me that they would do that'


data.columns

Index(['Unnamed: 0', 'Link', 'Title', 'Category', 'Post Author',
       'Leading Comment', 'Publish Time', 'Reply Authors', 'Reply Comments',
       'Reply Times'],
      dtype='object')


data[['Unnamed: 0', 'Link', 'Publish Time', 'Reply Times', 'Reply Authors']]


data.drop(columns= ['Unnamed: 0', 'Link', 'Publish Time', 'Reply Times', 'Reply Authors'], inplace=True)


data.head()


data['Leading Comment'][3]

'Im using inventorylab to upload my inventory to amazon I was selling books when i started and other random things in toy category I just want to know a real answer can i send in products now or what this is driving me crazy I have noticed the buy boxes switch to favor more prime now so thats why im wondering did i miss something'


data['Leading Comment'][:3]

0                          I need to know how to start
1    Hi are there any other seller experiencing del...
2    Hi all\nSo back when the new removaldisposal f...
Name: Leading Comment, dtype: object


data['Reply Comments'][0]

"['\\nI need to know how to start\\n', ' \\nAmazon is not the platform for a new seller with a random box of books\\nHave you completed seller university\\n', ' \\nThank you for responding to me  No have not been through any training  Would Abesbooks be better to work with\\nI didn’t realize that it would be so complicated\\nAgain thank you\\nCheri\\n', ' \\nAmazon is more aimed at real businesses intending to build an ongoing presence While it’s possible for an individual to sell a few random items it’s not often the best place\\nBefore deciding where to sell your books the first step would be to find if they are even worth trying to sell Amazon is a good place to use for this and you can do it from a buyer account\\nSimply search each book by its ISBN or if no ISBN titleauthor and see what it is selling for While it’s possible that you can get more on another platform if a book is selling for 5 on Amazon it’s unlikely that you’ll be able to get enough anywhere to make it really worth your time listing by the time you count packaging you might end up paying to dispose of your books\\nUnless a significant number of your books list for more than 10 and that’s still really low you would probably be better off donating them I find that around 95 of books are simply not worth listing with fiction it’s even worse\\nI’ve never worked with Abebooks but have had good luck on the bay with books that for one reason or another cannot be sold on Amazon\\n', ' \\nStarting to sell books may not be worthwhile currently Hard to know if you will grade properly or overgrade Hard to know if you will get or have access to more sellable inventory\\nAbe is fairly easy to navigate and price check also\\nMany booksellers sell on both platforms since Amazon bought Abe\\nSome books I have sold on Ebay for cheap else donate\\n', '']"


import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline


df = data.copy()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7882 entries, 0 to 7881
Data columns (total 6 columns):
Title              7882 non-null object
Category           7882 non-null object
Post Author        7882 non-null object
Leading Comment    7882 non-null object
Reply Comments     7882 non-null object
post               7882 non-null object
dtypes: object(6)
memory usage: 369.6+ KB


df['post'] = df['Post Author'] + ' ' + df['Title'] + ' ' + df['Leading Comment'] + ' ' + df['Reply Comments']


df['post'].head()

0    Earth_Light_Books 2 boxes of books I need to k...
1    HOUSE_OF_LORDS_RODEO FBA Shipment Delivered De...
2    Texastoys FBA removal/disposal fees Hi all\nSo...
3    Assuranceproducts Am i able to send in FBA inv...
4    BillsBuys Stranded Inventory I recently listed...
Name: post, dtype: object


df['post'][1]

"HOUSE_OF_LORDS_RODEO FBA Shipment Delivered Delayed Check in Hi are there any other seller experiencing delay in Checking in of Shipped FBA Inventory We’ve sent it last May 21st and we’ve sent to four different fulfillment center The three were already received However the last one which was the biggest shipment has not yet been received Is it because of the location of the Fulfillment Center I am just worried that we already amped up our ad spending because the other shipments were already received but this last shipment haven’t been checked in so it’s greatly affecting our sales Thanks for any input ['\\nHi are there any other seller experiencing delay in Checking in of Shipped FBA Inventory We’ve sent it last May 21st and we’ve sent to four different fulfillment center The three were already received However the last one which was the biggest shipment has not yet been received Is it because of the location of the Fulfillment Center I am just worried that we already amped up our ad spending because the other shipments were already received but this last shipment haven’t been checked in so it’s greatly affecting our sales Thanks for any input\\n', ' \\nEDIT Wrong date We’ve sent ALL SHIPMENT LAST MAY 08 Thank you\\n', '']"


print(df['post'].apply(lambda x: len(x.split(' '))).sum())

4986021


df['Category'].unique()

array(['Fulfillment By Amazon', 'Selling on Amazon',
       'Amazon Marketplace Web Service (MWS)',
       'Amazon Sponsored Products', 'Account Health', 'Global Selling',
       'Amazon Pay', 'Groups',
       'Health,Safety,Sustainability,Security & Compliance',
       'Login With Amazon', 'Amazon Custom'], dtype=object)


my_categories = ['Fulfillment By Amazon', 'Selling on Amazon',
       'Amazon Marketplace Web Service (MWS)',
       'Amazon Sponsored Products', 'Account Health', 'Global Selling',
       'Amazon Pay', 'Groups',
       'Health,Safety,Sustainability,Security & Compliance',
       'Login With Amazon', 'Amazon Custom']
plt.figure(figsize=(10,4))
df.Category.value_counts().plot(kind='bar');


def print_plot(index):
    example = df[df.index == index][['post', 'Category']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Category:', example[1])
print_plot(10)

Book_Corral First FBA shipment since pandemic Now that we’re living under “pandemic” rules can I ship anything to FBA fulfillment
Will I be alerted while making an FBA shipment
I don’t want to ship stuff in and have Amazon tell me they can’t take this or that right now and have to deal with returning items or whatever
Can I just start an FBA shipment and the system will take care of everything ['\nNow that we’re living under “pandemic” rules can I ship anything to FBA fulfillment\nWill I be alerted while making an FBA shipment\nI don’t want to ship stuff in and have Amazon tell me they can’t take this or that right now and have to deal with returning items or whatever\nCan I just start an FBA shipment and the system will take care of everything\n', ' \nYes Already many threads on this topic Just search around FBA is open for business but quantities severely limited and shipping times very slow May be better to stick with fbm for a while\n', ' \nAmazon will either let you make the shipment or not make it You’ll figure it out pretty early in the process\nHowever what the WON’T tell you is how long it may take for your shipment to be received I don’t think enough people have shipped in nonessential items yet to have a good idea of how fast receiving is happening but I suspect that the delays are longer than normal On top of that we’re still looking at FBA orders taking extended times to ship so customers are buying more FBM FBM even getting the BB over FBA in some cases\nPersonally I’m not shipping any books in right now for these reasons as well as the fact that I don’t want to go to the UPS store to drop off a box Not that it’s a major issue with all the library booksales cancelled I don’t have that many books to list anyway\nIf you decide to ship stuff in let us know how it goes\n', ' \nTry creating a shipment If it allows you create it you should be ok although you’ll likely be subject to quantity limits  But as picksbynisha noted check other Forum threads  There are scattered anecdotal reports of shipments being lost not properly accounted for etc\n', ' \nI’ve sent three shipments to FTW1 in the last month\nFirst one was a pallet delivered with LTL shipping and from shipment to receiving was six days\nSecond one was a few boxes via UPS shipment to receiving was one day\nThird one was a few more boxes via UPS shipped five days ago delivered three days ago still not checkedin or receiving\nIn the last year my 18 shipments via FedEx and UPS to FTW have ranged from 2 to 19 days from shipment to receiving typical transit time is 12 days with an average of 6 days So overall things at that particular warehouse for me have been moving at normal to aboveaverage speed\n', ' \nBy “quantities limited” I assume you mean FBA is only allowing me to send a limited quantity of some items If this is so how will I know if I’m over limit I typically only sell one of each item\nI guess I’m wondering if there items I was able to list on Amazon which are not allowed to be sent on to FBA Can I go ahead and make a shipment and ship it and be confidant it will be accepted\nI have limited storage and would rather wait a while for FBA to receive my shipment than hold on to it Since it sellers faster as FBA I’ll start selling more sooner that way\n', ' \nI have a good bit for me of FBM and it’s not selling well I had been doing well with FBA but my inventory is running out I know from experience that as soon as I can ship and get listed my present FBM as FBA it will start selling\nI’m guessing Amazon is dealing with a lot of new hires maybe lowering their hiring standards as well as dealing with employee descent which is affecting FBA intake performance\n', ' \nYes receiving may be very slow right now My 200pcs was delivered on Monday 518 and Amazon has yet to start receiving my inventory on 523\nSounds like it depends on where you’re sending it though\n', '']
Category: Fulfillment By Amazon


print_plot(30)

ThisIsPapaya Adding Product for FBA but NOT for listing on Amazon I want to set up my products on Amazon so that I can start setting up my shipment to the amazon warehouse for fulfillment on my website orders My understanding is that in order to do this I need to “add a product”
However I will not be listing these products on Amazon for sale So how to “add a product” without having it available for sale on amazon but still exist so that I can set up a shipment to the fulfillment center with that product ['\nI want to set up my products on Amazon so that I can start setting up my shipment to the amazon warehouse for fulfillment on my website orders My understanding is that in order to do this I need to “add a product”\nHowever I will not be listing these products on Amazon for sale So how to “add a product” without having it available for sale on amazon but still exist so that I can set up a shipment to the fulfillment center with that product\n', ' \nI don’t think amazon is set up to work that way I’m still trying to figure out why you would want to do this even if you could What advantages do you think you would gain in doing so\n', ' \nBefore I give my answer to your question I would like to add that I think you would benefit from listing your product on Amazon It’s unlikely to have an effect on your existing website sales but it could definitely add a whole new revenue source to your business\nAmazon offers ‘Multichannel fulfilment’ which allows you to essentially copy orders from your website or any other marketplace and have them fulfilled from an Amazon warehouse\nHowever if you just need the fulfilment facilities then there are other companies which do this that are much less expensive than Amazon Amazon charges higher fees for FBA if the product was not sold on their site But if you’re selling on Amazon and your website then FBA is the way to go and is what I do since I live in the UK but most of my orders are in the USA so it’s not economically to ship every order internationally\nHope this helps a bit\n', ' \nyou can also look into other fulfillment places then you don’t have to deal with any product issues you may or may not have with amazon\nWe use one for another marketplace and they work really well\n', ' \nYou can create the product  listing as FBA send it in to the Fulfillment Center then while it’s in transit switch to FBM You’ll always need to switch to FBA to send in more stuff but it will mean that you have an active listing stock on hand and no display on the Amazon site\nTo the other posters the OP may have found that Amazon’s commission is too large for him to sell it on their marketplace\n', ' \nIf you do not want to sell FBA there are so many other perhaps better and less expensive fulfillment houses Use them\n', ' \n\n\n', '\n ThisIsPapaya', '\n\nSo how to “add a product” without having it available for sale on amazon\n\n\nYou can’t\n', ' \nThanks for the insights everyone My comments to all the questionscomments are below PS the products I’ll be selling is under my own brand\n\n\nI would consider NOT selling on Amazon and only selling on my website in the scenario where my customer acquisition costs is less than the 15 amazon charges me If I can acquire customers through Instagram targeted Adsinfluencers for a cost of lets say 10 of my selling price then I prefer use my inventory for sales outside of Amazon which are more profitable\n\n\nThat being said in case I can’t reach high volumes through my other avenues of advertising or my customer acquisition cost is higher than 15 I’ll then want to turn on the amazon listing as well This is why I’d like to use FBA and not another fulfillment center  so I can easily toggle on Amazon sales if necessary\n\n\nYeStorePosters thanks for the MultiFulfillment Comment Sounds like this is the avenue to utilizing FBA without listing a product on Amazon I will do some research on this any further insight from you on this matter can be helpful At the same time I will start looking into other fulfillment providers  putting cost aside for a moment how much better or worse are they than FBA\n', '']
Category: Fulfillment By Amazon


test_df = df.copy()


print_plot(10)

Book_Corral First FBA shipment since pandemic Now that we’re living under “pandemic” rules can I ship anything to FBA fulfillment
Will I be alerted while making an FBA shipment
I don’t want to ship stuff in and have Amazon tell me they can’t take this or that right now and have to deal with returning items or whatever
Can I just start an FBA shipment and the system will take care of everything ['\nNow that we’re living under “pandemic” rules can I ship anything to FBA fulfillment\nWill I be alerted while making an FBA shipment\nI don’t want to ship stuff in and have Amazon tell me they can’t take this or that right now and have to deal with returning items or whatever\nCan I just start an FBA shipment and the system will take care of everything\n', ' \nYes Already many threads on this topic Just search around FBA is open for business but quantities severely limited and shipping times very slow May be better to stick with fbm for a while\n', ' \nAmazon will either let you make the shipment or not make it You’ll figure it out pretty early in the process\nHowever what the WON’T tell you is how long it may take for your shipment to be received I don’t think enough people have shipped in nonessential items yet to have a good idea of how fast receiving is happening but I suspect that the delays are longer than normal On top of that we’re still looking at FBA orders taking extended times to ship so customers are buying more FBM FBM even getting the BB over FBA in some cases\nPersonally I’m not shipping any books in right now for these reasons as well as the fact that I don’t want to go to the UPS store to drop off a box Not that it’s a major issue with all the library booksales cancelled I don’t have that many books to list anyway\nIf you decide to ship stuff in let us know how it goes\n', ' \nTry creating a shipment If it allows you create it you should be ok although you’ll likely be subject to quantity limits  But as picksbynisha noted check other Forum threads  There are scattered anecdotal reports of shipments being lost not properly accounted for etc\n', ' \nI’ve sent three shipments to FTW1 in the last month\nFirst one was a pallet delivered with LTL shipping and from shipment to receiving was six days\nSecond one was a few boxes via UPS shipment to receiving was one day\nThird one was a few more boxes via UPS shipped five days ago delivered three days ago still not checkedin or receiving\nIn the last year my 18 shipments via FedEx and UPS to FTW have ranged from 2 to 19 days from shipment to receiving typical transit time is 12 days with an average of 6 days So overall things at that particular warehouse for me have been moving at normal to aboveaverage speed\n', ' \nBy “quantities limited” I assume you mean FBA is only allowing me to send a limited quantity of some items If this is so how will I know if I’m over limit I typically only sell one of each item\nI guess I’m wondering if there items I was able to list on Amazon which are not allowed to be sent on to FBA Can I go ahead and make a shipment and ship it and be confidant it will be accepted\nI have limited storage and would rather wait a while for FBA to receive my shipment than hold on to it Since it sellers faster as FBA I’ll start selling more sooner that way\n', ' \nI have a good bit for me of FBM and it’s not selling well I had been doing well with FBA but my inventory is running out I know from experience that as soon as I can ship and get listed my present FBM as FBA it will start selling\nI’m guessing Amazon is dealing with a lot of new hires maybe lowering their hiring standards as well as dealing with employee descent which is affecting FBA intake performance\n', ' \nYes receiving may be very slow right now My 200pcs was delivered on Monday 518 and Amazon has yet to start receiving my inventory on 523\nSounds like it depends on where you’re sending it though\n', '']
Category: Fulfillment By Amazon


REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\'\“\”\’\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
#STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    #text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    #text = ''.join(word for word in text if word not in punct)  # remove punctuation
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
test_df['post'] = test_df['post'].apply(clean_text)


def test_print_plot(index):
    example = test_df[test_df.index == index][['post', 'Category']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Category:', example[1])
test_print_plot(10)

book_corral first fba shipment since pandemic now that we re living under  pandemic  rules can i ship anything to fba fulfillment
will i be alerted while making an fba shipment
i don t want to ship stuff in and have amazon tell me they can t take this or that right now and have to deal with returning items or whatever
can i just start an fba shipment and the system will take care of everything   \nnow that we re living under  pandemic  rules can i ship anything to fba fulfillment\nwill i be alerted while making an fba shipment\ni don t want to ship stuff in and have amazon tell me they can t take this or that right now and have to deal with returning items or whatever\ncan i just start an fba shipment and the system will take care of everything\n     \nyes already many threads on this topic just search around fba is open for business but quantities severely limited and shipping times very slow may be better to stick with fbm for a while\n     \namazon will either let you make the shipment or not make it you ll figure it out pretty early in the process\nhowever what the won t tell you is how long it may take for your shipment to be received i don t think enough people have shipped in nonessential items yet to have a good idea of how fast receiving is happening but i suspect that the delays are longer than normal on top of that we re still looking at fba orders taking extended times to ship so customers are buying more fbm fbm even getting the bb over fba in some cases\npersonally i m not shipping any books in right now for these reasons as well as the fact that i don t want to go to the ups store to drop off a box not that it s a major issue with all the library booksales cancelled i don t have that many books to list anyway\nif you decide to ship stuff in let us know how it goes\n     \ntry creating a shipment if it allows you create it you should be ok although you ll likely be subject to quantity limits  but as picksbynisha noted check other forum threads  there are scattered anecdotal reports of shipments being lost not properly accounted for etc\n     \ni ve sent three shipments to ftw1 in the last month\nfirst one was a pallet delivered with ltl shipping and from shipment to receiving was six days\nsecond one was a few boxes via ups shipment to receiving was one day\nthird one was a few more boxes via ups shipped five days ago delivered three days ago still not checkedin or receiving\nin the last year my 18 shipments via fedex and ups to ftw have ranged from 2 to 19 days from shipment to receiving typical transit time is 12 days with an average of 6 days so overall things at that particular warehouse for me have been moving at normal to aboveaverage speed\n     \nby  quantities limited  i assume you mean fba is only allowing me to send a limited quantity of some items if this is so how will i know if i m over limit i typically only sell one of each item\ni guess i m wondering if there items i was able to list on amazon which are not allowed to be sent on to fba can i go ahead and make a shipment and ship it and be confidant it will be accepted\ni have limited storage and would rather wait a while for fba to receive my shipment than hold on to it since it sellers faster as fba i ll start selling more sooner that way\n     \ni have a good bit for me of fbm and it s not selling well i had been doing well with fba but my inventory is running out i know from experience that as soon as i can ship and get listed my present fbm as fba it will start selling\ni m guessing amazon is dealing with a lot of new hires maybe lowering their hiring standards as well as dealing with employee descent which is affecting fba intake performance\n     \nyes receiving may be very slow right now my 200pcs was delivered on monday 518 and amazon has yet to start receiving my inventory on 523\nsounds like it depends on where you re sending it though\n      
Category: Fulfillment By Amazon


test_df['post'].apply(lambda x: len(x.split(' '))).sum()

5303539


5303539 - 4986021

317518


X = test_df.post
y = test_df.Category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)


X_train.head()

3885    plankton_games issue with fba orders damaged i...
7710    funnyshirts amazon automatically generating re...
7445    point_bob_books photos added to offer section ...
4301    riellesells seller account at risk of deactiva...
605     best_us_book_deals newbie and book restriction...
Name: post, dtype: object


y_train.head()

3885    Fulfillment By Amazon
7710            Amazon Custom
7445        Selling on Amazon
4301           Account Health
605         Selling on Amazon
Name: Category, dtype: object


from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)


from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res1311 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.46596194503171245
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.77      0.95      0.85       439
                                 Selling on Amazon       0.00      0.00      0.00        78
              Amazon Marketplace Web Service (MWS)       0.00      0.00      0.00        95
                         Amazon Sponsored Products       0.00      0.00      0.00       164
                                    Account Health       0.00      0.00      0.00        99
                                    Global Selling       0.89      0.24      0.38       447
                                        Amazon Pay       0.00      0.00      0.00       201
                                            Groups       0.00      0.00      0.00       137
Health,Safety,Sustainability,Security & Compliance       0.00      0.00      0.00        13
                                 Login With Amazon       0.00      0.00      0.00        56
                                     Amazon Custom       0.34      0.90      0.49       636

                                          accuracy                           0.47      2365
                                         macro avg       0.18      0.19      0.16      2365
                                      weighted avg       0.40      0.47      0.36      2365

/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)


from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res1321 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.735306553911205
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.77      0.95      0.85       439
                                 Selling on Amazon       0.78      0.49      0.60        78
              Amazon Marketplace Web Service (MWS)       0.77      0.64      0.70        95
                         Amazon Sponsored Products       0.74      0.68      0.71       164
                                    Account Health       0.89      0.86      0.87        99
                                    Global Selling       0.70      0.87      0.77       447
                                        Amazon Pay       0.78      0.29      0.43       201
                                            Groups       0.84      0.74      0.79       137
Health,Safety,Sustainability,Security & Compliance       0.00      0.00      0.00        13
                                 Login With Amazon       0.79      0.80      0.80        56
                                     Amazon Custom       0.68      0.68      0.68       636

                                          accuracy                           0.74      2365
                                         macro avg       0.70      0.64      0.65      2365
                                      weighted avg       0.74      0.74      0.72      2365

/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)


from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict (X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res1331 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
  "this warning.", FutureWarning)

accuracy 0.7276955602536997
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.86      0.92      0.89       439
                                 Selling on Amazon       0.72      0.49      0.58        78
              Amazon Marketplace Web Service (MWS)       0.80      0.74      0.77        95
                         Amazon Sponsored Products       0.70      0.70      0.70       164
                                    Account Health       0.90      0.76      0.82        99
                                    Global Selling       0.72      0.77      0.74       447
                                        Amazon Pay       0.62      0.42      0.50       201
                                            Groups       0.85      0.73      0.79       137
Health,Safety,Sustainability,Security & Compliance       0.40      0.15      0.22        13
                                 Login With Amazon       0.80      0.71      0.75        56
                                     Amazon Custom       0.62      0.71      0.66       636

                                          accuracy                           0.73      2365
                                         macro avg       0.73      0.64      0.68      2365
                                      weighted avg       0.73      0.73      0.72      2365


from sklearn.tree import DecisionTreeClassifier

dtree = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', DecisionTreeClassifier(random_state=0)),
               ])
dtree.fit(X_train, y_train)

y_pred = dtree.predict (X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res1341 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.5754756871035941
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.76      0.74      0.75       439
                                 Selling on Amazon       0.39      0.31      0.34        78
              Amazon Marketplace Web Service (MWS)       0.59      0.53      0.56        95
                         Amazon Sponsored Products       0.52      0.51      0.51       164
                                    Account Health       0.68      0.63      0.65        99
                                    Global Selling       0.60      0.63      0.61       447
                                        Amazon Pay       0.39      0.35      0.37       201
                                            Groups       0.66      0.62      0.64       137
Health,Safety,Sustainability,Security & Compliance       0.13      0.15      0.14        13
                                 Login With Amazon       0.77      0.64      0.70        56
                                     Amazon Custom       0.49      0.53      0.51       636

                                          accuracy                           0.58      2365
                                         macro avg       0.54      0.51      0.53      2365
                                      weighted avg       0.58      0.58      0.58      2365


import pandas as pd

results = pd.DataFrame({'Model': ['Naive Bayes MultinomialNB', 'Linear SVM', 'Logistic Regression', 'Decision Tree'],
                         'Accuracy': [res1311, res1321, res1331, res1341]})
results.set_index('Model')
results.sort_values(by='Accuracy')


from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
cv_res_sgd = cross_val_score(sgd, X_train, y_train, cv=10)
mean_cv_res_sgd = np.mean(cv_res_sgd)


print(mean_cv_res_sgd)

0.7373282973519802


test_df2 = test_df.copy()


REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\'\“\”\’\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    #text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    #text = ''.join(word for word in text if word not in punct)  # remove punctuation
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
test_df2['post'] = test_df2['post'].apply(clean_text)


def test_print_plot(index):
    example = test_df2[test_df2.index == index][['post', 'Category']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Category:', example[1])
test_print_plot(10)

book_corral first fba shipment since pandemic living pandemic rules ship anything fba fulfillment alerted making fba shipment want ship stuff amazon tell take right deal returning items whatever start fba shipment system take care everything \nnow living pandemic rules ship anything fba fulfillment\nwill alerted making fba shipment\ni want ship stuff amazon tell take right deal returning items whatever\ncan start fba shipment system take care everything\n \nyes already many threads topic search around fba open business quantities severely limited shipping times slow may better stick fbm while\n \namazon either let make shipment make figure pretty early process\nhowever tell long may take shipment received think enough people shipped nonessential items yet good idea fast receiving happening suspect delays longer normal top still looking fba orders taking extended times ship customers buying fbm fbm even getting bb fba cases\npersonally shipping books right reasons well fact want go ups store drop box major issue library booksales cancelled many books list anyway\nif decide ship stuff let us know goes\n \ntry creating shipment allows create ok although likely subject quantity limits picksbynisha noted check forum threads scattered anecdotal reports shipments lost properly accounted etc\n \ni sent three shipments ftw1 last month\nfirst one pallet delivered ltl shipping shipment receiving six days\nsecond one boxes via ups shipment receiving one day\nthird one boxes via ups shipped five days ago delivered three days ago still checkedin receiving\nin last year 18 shipments via fedex ups ftw ranged 2 19 days shipment receiving typical transit time 12 days average 6 days overall things particular warehouse moving normal aboveaverage speed\n \nby quantities limited assume mean fba allowing send limited quantity items know limit typically sell one item\ni guess wondering items able list amazon allowed sent fba go ahead make shipment ship confidant accepted\ni limited storage would rather wait fba receive shipment hold since sellers faster fba start selling sooner way\n \ni good bit fbm selling well well fba inventory running know experience soon ship get listed present fbm fba start selling\ni guessing amazon dealing lot new hires maybe lowering hiring standards well dealing employee descent affecting fba intake performance\n \nyes receiving may slow right 200pcs delivered monday 518 amazon yet start receiving inventory 523\nsounds like depends sending though\n
Category: Fulfillment By Amazon


X = test_df2.post
y = test_df2.Category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)


X_train.head()

3885    plankton_games issue fba orders damaged shippi...
7710    funnyshirts amazon automatically generating re...
7445    point_bob_books photos added offer section gen...
4301    riellesells seller account risk deactivation h...
605     best_us_book_deals newbie book restrictions se...
Name: post, dtype: object


y_train.head()

3885    Fulfillment By Amazon
7710            Amazon Custom
7445        Selling on Amazon
4301           Account Health
605         Selling on Amazon
Name: Category, dtype: object


from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)


from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res2311 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.4904862579281184
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.73      0.96      0.83       439
                                 Selling on Amazon       0.00      0.00      0.00        78
              Amazon Marketplace Web Service (MWS)       0.00      0.00      0.00        95
                         Amazon Sponsored Products       0.00      0.00      0.00       164
                                    Account Health       0.00      0.00      0.00        99
                                    Global Selling       0.87      0.42      0.56       447
                                        Amazon Pay       0.00      0.00      0.00       201
                                            Groups       1.00      0.01      0.01       137
Health,Safety,Sustainability,Security & Compliance       0.00      0.00      0.00        13
                                 Login With Amazon       0.00      0.00      0.00        56
                                     Amazon Custom       0.35      0.87      0.50       636

                                          accuracy                           0.49      2365
                                         macro avg       0.27      0.20      0.17      2365
                                      weighted avg       0.45      0.49      0.40      2365

/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)


from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res2321 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.7441860465116279
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.78      0.96      0.86       439
                                 Selling on Amazon       0.78      0.50      0.61        78
              Amazon Marketplace Web Service (MWS)       0.78      0.66      0.72        95
                         Amazon Sponsored Products       0.72      0.70      0.71       164
                                    Account Health       0.89      0.86      0.87        99
                                    Global Selling       0.71      0.86      0.78       447
                                        Amazon Pay       0.75      0.30      0.43       201
                                            Groups       0.85      0.77      0.81       137
Health,Safety,Sustainability,Security & Compliance       0.00      0.00      0.00        13
                                 Login With Amazon       0.81      0.79      0.80        56
                                     Amazon Custom       0.69      0.69      0.69       636

                                          accuracy                           0.74      2365
                                         macro avg       0.71      0.65      0.66      2365
                                      weighted avg       0.74      0.74      0.73      2365

/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)


from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict (X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res2331 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
  "this warning.", FutureWarning)

accuracy 0.7281183932346723
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.88      0.93      0.90       439
                                 Selling on Amazon       0.73      0.46      0.57        78
              Amazon Marketplace Web Service (MWS)       0.76      0.74      0.75        95
                         Amazon Sponsored Products       0.68      0.67      0.68       164
                                    Account Health       0.90      0.71      0.79        99
                                    Global Selling       0.73      0.77      0.75       447
                                        Amazon Pay       0.60      0.44      0.51       201
                                            Groups       0.87      0.72      0.78       137
Health,Safety,Sustainability,Security & Compliance       0.67      0.15      0.25        13
                                 Login With Amazon       0.82      0.71      0.76        56
                                     Amazon Custom       0.62      0.72      0.67       636

                                          accuracy                           0.73      2365
                                         macro avg       0.75      0.64      0.67      2365
                                      weighted avg       0.73      0.73      0.72      2365


from sklearn.tree import DecisionTreeClassifier

dtree = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', DecisionTreeClassifier(random_state=0)),
               ])
dtree.fit(X_train, y_train)

y_pred = dtree.predict (X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
res2341 = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.5826638477801268
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.80      0.77      0.78       439
                                 Selling on Amazon       0.35      0.38      0.37        78
              Amazon Marketplace Web Service (MWS)       0.61      0.54      0.57        95
                         Amazon Sponsored Products       0.52      0.51      0.51       164
                                    Account Health       0.76      0.72      0.74        99
                                    Global Selling       0.60      0.60      0.60       447
                                        Amazon Pay       0.35      0.33      0.34       201
                                            Groups       0.71      0.69      0.70       137
Health,Safety,Sustainability,Security & Compliance       0.05      0.08      0.06        13
                                 Login With Amazon       0.70      0.66      0.68        56
                                     Amazon Custom       0.50      0.54      0.52       636

                                          accuracy                           0.58      2365
                                         macro avg       0.54      0.53      0.53      2365
                                      weighted avg       0.59      0.58      0.59      2365


from sklearn.metrics import confusion_matrix
import seaborn as sns

conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=my_categories, yticklabels=my_categories)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


import pandas as pd

results = pd.DataFrame({'Model': ['Naive Bayes MultinomialNB', 'Linear SVM', 'Logistic Regression', 'Decision Tree'],
                         'Accuracy': [res2311, res2321, res2331, res2341]})
results.set_index('Model')
results.sort_values(by='Accuracy')


from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
cv_res_sgd2 = cross_val_score(sgd, X_train, y_train, cv=10)
mean_cv_res_sgd2 = np.mean(cv_res_sgd2)


print(mean_cv_res_sgd2)

0.7389693155220105


test_df3 = test_df2.copy()


REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\'\“\”\’\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = text.replace(r'\n', ' ')
    #text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    #text = ''.join(word for word in text if word not in punct)  # remove punctuation
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
test_df3['post'] = test_df3['post'].apply(clean_text)


def test_print_plot(index):
    example = test_df3[test_df3.index == index][['post', 'Category']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Category:', example[1])
test_print_plot(10)

book_corral first fba shipment since pandemic living pandemic rules ship anything fba fulfillment alerted making fba shipment want ship stuff amazon tell take right deal returning items whatever start fba shipment system take care everything living pandemic rules ship anything fba fulfillment alerted making fba shipment want ship stuff amazon tell take right deal returning items whatever start fba shipment system take care everything yes already many threads topic search around fba open business quantities severely limited shipping times slow may better stick fbm amazon either let make shipment make figure pretty early process however tell long may take shipment received think enough people shipped nonessential items yet good idea fast receiving happening suspect delays longer normal top still looking fba orders taking extended times ship customers buying fbm fbm even getting bb fba cases personally shipping books right reasons well fact want go ups store drop box major issue library booksales cancelled many books list anyway decide ship stuff let us know goes try creating shipment allows create ok although likely subject quantity limits picksbynisha noted check forum threads scattered anecdotal reports shipments lost properly accounted etc sent three shipments ftw1 last month first one pallet delivered ltl shipping shipment receiving six days second one boxes via ups shipment receiving one day third one boxes via ups shipped five days ago delivered three days ago still checkedin receiving last year 18 shipments via fedex ups ftw ranged 2 19 days shipment receiving typical transit time 12 days average 6 days overall things particular warehouse moving normal aboveaverage speed quantities limited assume mean fba allowing send limited quantity items know limit typically sell one item guess wondering items able list amazon allowed sent fba go ahead make shipment ship confidant accepted limited storage would rather wait fba receive shipment hold since sellers faster fba start selling sooner way good bit fbm selling well well fba inventory running know experience soon ship get listed present fbm fba start selling guessing amazon dealing lot new hires maybe lowering hiring standards well dealing employee descent affecting fba intake performance yes receiving may slow right 200pcs delivered monday 518 amazon yet start receiving inventory 523 sounds like depends sending though
Category: Fulfillment By Amazon


X = test_df3.post
y = test_df3.Category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)


from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
cv_res_sgd3 = cross_val_score(sgd, X_train, y_train, cv=10)
mean_cv_res_sgd3 = np.mean(cv_res_sgd3)


print(mean_cv_res_sgd3)

0.7418685879209101


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

rf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=500, max_depth=500, random_state=0)),
               ])

cv_res_rf = cross_val_score(rf, X_train, y_train, cv=10)
mean_cv_res_rf = np.mean(cv_res_rf)


print(mean_cv_res_rf)#max_depth = 10, n_estimators=300

0.4819534394722039


print(mean_cv_res_rf)# max_depth = 100, n_estimators=300

0.679733161577614


print(mean_cv_res_rf)# max_depth = 500, n_estimators=300

0.6837375625515002


##### BEST #####
print(mean_cv_res_rf)# max_depth = 500, n_estimators=400 
##### BEST #####

0.6862702788461644


print(mean_cv_res_rf)# max_depth = 1000, n_estimators=400

0.6862702788461644


print(mean_cv_res_rf)# max_depth = 500, n_estimators=500

0.6860729824560556


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

rf = Pipeline([('vect', TfidfVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=200, max_depth=100, random_state=0)),
               ])

cv_res_rf_tfidf = cross_val_score(rf, X_train, y_train, cv=10)
mean_cv_res_rf_tfidf = np.mean(cv_res_rf_tfidf)


mean_cv_res_rf_tfidf# max_depth = 500, n_estimators=500

0.6897103851566124


mean_cv_res_rf_tfidf # n_estimators=200, max_depth=100

0.6817362730564369


from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

sgd = Pipeline([('vect', TfidfVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=100, tol=None)),
               ])
cv_res_sgd_tfidf = cross_val_score(sgd, X_train, y_train, cv=10)
mean_cv_res_sgd_tfidf = np.mean(cv_res_sgd_tfidf)


mean_cv_res_sgd_tfidf #10 in max_iter

0.7371562662790911


mean_cv_res_sgd_tfidf# 100 in max_iter

0.7364230632814868


!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/be/87/aa2f2e55ea6ea2098ae191894488430162fe04b0fa89ac06dc923228a8d8/xgboost-1.1.1-py3-none-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl (1.1MB)
     |████████████████████████████████| 1.1MB 276kB/s eta 0:00:01
Requirement already satisfied: scipy in /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages (from xgboost) (1.3.2)
Requirement already satisfied: numpy in /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages (from xgboost) (1.17.3)
Installing collected packages: xgboost
Successfully installed xgboost-1.1.1


import xgboost as xgb

xgb = Pipeline(steps=[('vect', TfidfVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('xgboost', xgb.XGBClassifier(objective='multi:softmax'))])

cv_res_xgb_tfidf = cross_val_score(xgb, X_train, y_train, cv=10)
mean_cv_res_xgb_tfidf = np.mean(cv_res_xgb_tfidf)


mean_cv_res_xgb_tfidf

0.740596360201003


xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)


print('accuracy %s' % accuracy_score(predicted_LGBM, y_test))
res_xgb = accuracy_score(y_pred, y_test)
print(classification_report(y_test, y_pred,target_names=my_categories))

accuracy 0.7517970401691332
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.90      0.90      0.90       439
                                 Selling on Amazon       0.74      0.51      0.61        78
              Amazon Marketplace Web Service (MWS)       0.78      0.68      0.73        95
                         Amazon Sponsored Products       0.74      0.68      0.71       164
                                    Account Health       0.91      0.79      0.84        99
                                    Global Selling       0.75      0.80      0.78       447
                                        Amazon Pay       0.66      0.42      0.51       201
                                            Groups       0.92      0.72      0.81       137
Health,Safety,Sustainability,Security & Compliance       0.00      0.00      0.00        13
                                 Login With Amazon       0.85      0.73      0.79        56
                                     Amazon Custom       0.61      0.76      0.68       636

                                          accuracy                           0.74      2365
                                         macro avg       0.72      0.64      0.67      2365
                                      weighted avg       0.75      0.74      0.74      2365

/opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)


conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=my_categories, yticklabels=my_categories)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/21/d1/7773d81964183f6892f71cf43b92f90d0bb8c954c05651d5071a2b480420/lightgbm-2.3.1-py2.py3-none-macosx_10_9_x86_64.macosx_10_10_x86_64.macosx_10_11_x86_64.macosx_10_12_x86_64.macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl (679kB)
     |████████████████████████████████| 686kB 290kB/s eta 0:00:01
Requirement already satisfied: scipy in /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages (from lightgbm) (1.3.2)
Requirement already satisfied: scikit-learn in /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages (from lightgbm) (0.21.3)
Requirement already satisfied: numpy in /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages (from lightgbm) (1.17.3)
Requirement already satisfied: joblib>=0.11 in /opt/anaconda3/envs/deep-learning/lib/python3.6/site-packages (from scikit-learn->lightgbm) (0.14.0)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


import lightgbm as lgbm

tfidf_vec = TfidfVectorizer(dtype=np.float32, sublinear_tf=True, use_idf=True, smooth_idf=True)
X_data_tfidf = tfidf_vec.fit_transform(test_df3['post'])
X_train_tfidf = tfidf_vec.transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

clf_LGBM = lgbm.LGBMClassifier(objective='multiclass', verbose=-1, learning_rate=0.5, max_depth=20, num_leaves=50, n_estimators=120, max_bin=2000,)
clf_LGBM.fit(X_train_tfidf, y_train, verbose=-1)
predicted_LGBM = clf_LGBM.predict(X_test_tfidf)


cv_res_lgbm_tfidf = cross_val_score(clf_LGBM, X_train_tfidf, y_train, cv=10)
mean_cv_res_lgbm_tfidf = np.mean(cv_res_lgbm_tfidf)


mean_cv_res_lgbm_tfidf

0.7036528977483523


print('accuracy %s' % accuracy_score(predicted_LGBM, y_test))
res_lgbm = accuracy_score(predicted_LGBM, y_test)
print(classification_report(y_test, predicted_LGBM,target_names=my_categories))

accuracy 0.7517970401691332
                                                    precision    recall  f1-score   support

                             Fulfillment By Amazon       0.89      0.92      0.90       439
                                 Selling on Amazon       0.73      0.47      0.57        78
              Amazon Marketplace Web Service (MWS)       0.80      0.72      0.76        95
                         Amazon Sponsored Products       0.74      0.69      0.72       164
                                    Account Health       0.88      0.78      0.82        99
                                    Global Selling       0.76      0.80      0.78       447
                                        Amazon Pay       0.68      0.44      0.54       201
                                            Groups       0.92      0.71      0.80       137
Health,Safety,Sustainability,Security & Compliance       0.25      0.08      0.12        13
                                 Login With Amazon       0.93      0.77      0.84        56
                                     Amazon Custom       0.63      0.77      0.69       636

                                          accuracy                           0.75      2365
                                         macro avg       0.75      0.65      0.69      2365
                                      weighted avg       0.76      0.75      0.75      2365


conf_mat = confusion_matrix(y_test, predicted_LGBM)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=my_categories, yticklabels=my_categories)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


data = pd.read_csv('minClean_amazon.csv')


STOPWORDS = set(stopwords.words('english'))
STOPWORDSv2 = [word.upper() for word in STOPWORDS]


STOPWORDSv2

["SHE'S",
 'WASN',
 'IN',
 'I',
 'AGAINST',
 'OUR',
 'O',
 "HADN'T",
 'AT',
 'HERE',
 'YOURSELF',
 'SUCH',
 'OWN',
 'MY',
 'COULDN',
 'WHOM',
 'HE',
 'WILL',
 "SHAN'T",
 'NOT',
 'SAME',
 'NO',
 'RE',
 'BEFORE',
 'SHOULD',
 'HASN',
 "WEREN'T",
 'FEW',
 'VE',
 'AN',
 "ISN'T",
 "WON'T",
 'DID',
 'IT',
 "AREN'T",
 'HAS',
 'ABOVE',
 'SHOULDN',
 'ARE',
 'FURTHER',
 'BUT',
 'UNDER',
 'HIMSELF',
 "NEEDN'T",
 'THERE',
 'WHY',
 'TO',
 "MIGHTN'T",
 'WHEN',
 'OF',
 "DIDN'T",
 'HERSELF',
 'THEIR',
 "SHOULD'VE",
 'T',
 'D',
 'THEY',
 'WHICH',
 'BEEN',
 'THE',
 'THEN',
 "WASN'T",
 'SHE',
 'ON',
 'TOO',
 'VERY',
 'BEING',
 'HAVEN',
 'LL',
 'DON',
 'DOWN',
 'BECAUSE',
 'ITSELF',
 'AIN',
 'ME',
 'WOULDN',
 'MYSELF',
 'AGAIN',
 'SHAN',
 "HASN'T",
 "YOU'D",
 'IF',
 'OUT',
 'HOW',
 'JUST',
 'AREN',
 'DOING',
 'AS',
 "MUSTN'T",
 'THESE',
 'CAN',
 'HERS',
 'THEM',
 'ALL',
 "COULDN'T",
 'HAD',
 'THAN',
 'OTHER',
 'MIGHTN',
 'BETWEEN',
 'WE',
 'THAT',
 'DURING',
 'BE',
 'DOES',
 "HAVEN'T",
 'ISN',
 'IS',
 'AM',
 'HAVE',
 'INTO',
 'HIS',
 'HIM',
 "WOULDN'T",
 'ITS',
 'EACH',
 'MA',
 'WEREN',
 'MOST',
 'AFTER',
 'AND',
 'UP',
 'OFF',
 'DOESN',
 'SO',
 'THEMSELVES',
 'OR',
 'WHILE',
 'HAVING',
 'BOTH',
 "YOU'RE",
 'BY',
 'NOR',
 'WERE',
 "YOU'VE",
 'Y',
 'YOU',
 'THEIRS',
 'ONCE',
 'WITH',
 'NEEDN',
 'BELOW',
 'WHO',
 'DO',
 'HER',
 'ABOUT',
 'YOUR',
 'MUSTN',
 'M',
 'SOME',
 'OURS',
 'FROM',
 'OURSELVES',
 'MORE',
 'A',
 'THROUGH',
 'S',
 "DON'T",
 'DIDN',
 'NOW',
 "SHOULDN'T",
 'THOSE',
 'OVER',
 'WON',
 "IT'S",
 'WHERE',
 "DOESN'T",
 'HADN',
 'ANY',
 "THAT'LL",
 'ONLY',
 'WAS',
 'YOURS',
 'YOURSELVES',
 "YOU'LL",
 'WHAT',
 'THIS',
 'FOR',
 'UNTIL']


def clean_data(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = ' '.join(word for word in text.split() if word not in STOPWORDSv2) # delete stopwors from text
    return text
    
data['Leading Comment'] = data['Leading Comment'].apply(clean_data)


data['Leading Comment'].head()

0                            need to know how to start
1    Hi are there any other seller experiencing del...
2    Hi all So back when the new removaldisposal fe...
3    Im using inventorylab to upload my inventory t...
4    recently listed 100 books for sale on Amazon 2...
Name: Leading Comment, dtype: object


upcase_words = {}
regex = r"([A-Z]{3})"

for i, row in zip(range(500), data['Leading Comment'][:500]):
    row = data['Leading Comment'].str.findall(regex)[i]
    if len(row) != 0:
        upcase_words[i] = row


df_upword = pd.DataFrame(upcase_words.items(), columns=['Original Index', 'Upper Case Words'])


df_upword.head()


df_upword['Number of UpCase Words'] =  df_upword['Upper Case Words'].apply(lambda x: len(x))


df_upword.head()


df_upword['Number of UpCase Words'].values.max()

61


df_upword['Number of UpCase Words'][df_upword['Number of UpCase Words'].values == 61]

130    61
Name: Number of UpCase Words, dtype: int64


df_upword['Number of UpCase Words'][130]

61


df_upword['Upper Case Words'][130]

['FBA',
 'OHI',
 'FBA',
 'FBA',
 'ORI',
 'GIN',
 'CAS',
 'URL',
 'FBA',
 'FBA',
 'FBA',
 'URL',
 'FBA',
 'FBA',
 'FBA',
 'URL',
 'ASA',
 'CAS',
 'URL',
 'ASA',
 'FBA',
 'FBA',
 'FBA',
 'FBA',
 'FBA',
 'URL',
 'FBA',
 'URL',
 'URL',
 'FBA',
 'URL',
 'FBA',
 'FBA',
 'FNS',
 'FBA',
 'CAS',
 'URL',
 'WAY',
 'GET',
 'HEL',
 'FBA',
 'FBR',
 'URL',
 'URL',
 'FBA',
 'ORI',
 'GIN',
 'CAS',
 'URL',
 'URL',
 'CAS',
 'TEL',
 'WAI',
 'TIN',
 'CHE',
 'CKE',
 'FBA',
 'FBA',
 'FBA',
 'FBA',
 'DDB']


abb = {}
for word in df_upword['Upper Case Words'][130]:
    if word in abb.keys():
        abb[word] += 1
    else:
        abb[word] = 1

abb

{'FBA': 25,
 'OHI': 1,
 'ORI': 2,
 'GIN': 2,
 'CAS': 5,
 'URL': 13,
 'ASA': 2,
 'FNS': 1,
 'WAY': 1,
 'GET': 1,
 'HEL': 1,
 'FBR': 1,
 'TEL': 1,
 'WAI': 1,
 'TIN': 1,
 'CHE': 1,
 'CKE': 1,
 'DDB': 1}

	Unnamed: 0	Link	Publish Time	Reply Times	Reply Authors
0	0	http://sellercentral.amazon.com/forums/t/2-box...	2020-05-26 17:33:26+00:00	[Timestamp('2020-05-26 17:48:57+0000', tz='UTC...	<function get_reply_authors at 0x00000119FBB60...
1	1	http://sellercentral.amazon.com/forums/t/fba-s...	2020-05-26 15:28:17+00:00	[Timestamp('2020-05-26 16:03:50+0000', tz='UTC')]	<function get_reply_authors at 0x00000119FBB60...
2	2	http://sellercentral.amazon.com/forums/t/fba-r...	2020-05-26 05:43:46+00:00	[]	<function get_reply_authors at 0x00000119FBB60...
3	3	http://sellercentral.amazon.com/forums/t/am-i-...	2020-05-25 16:53:40+00:00	[Timestamp('2020-05-25 17:14:44+0000', tz='UTC...	<function get_reply_authors at 0x00000119FBB60...
4	4	http://sellercentral.amazon.com/forums/t/stran...	2020-05-24 02:14:21+00:00	[Timestamp('2020-05-25 01:42:41+0000', tz='UTC...	<function get_reply_authors at 0x00000119FBB60...
...	...	...	...	...	...
7877	7884	http://sellercentral.amazon.com/forums/t/amazo...	2016-02-24 16:00:39+00:00	[Timestamp('2016-02-24 16:04:39+0000', tz='UTC...	<function get_reply_authors at 0x00000119FBB60...
7878	7885	http://sellercentral.amazon.com/forums/t/just-...	2016-02-08 02:55:01+00:00	[Timestamp('2016-02-08 03:08:13+0000', tz='UTC...	<function get_reply_authors at 0x00000119FBB60...
7879	7886	http://sellercentral.amazon.com/forums/t/amazo...	2015-12-07 03:47:09+00:00	[Timestamp('2016-01-11 15:24:01+0000', tz='UTC...	<function get_reply_authors at 0x00000119FBB60...
7880	7887	http://sellercentral.amazon.com/forums/t/custo...	2015-12-30 20:15:50+00:00	[Timestamp('2016-01-03 21:29:26+0000', tz='UTC...	<function get_reply_authors at 0x00000119FBB60...
7881	7888	http://sellercentral.amazon.com/forums/t/amazo...	2015-12-31 18:30:54+00:00	[Timestamp('2018-02-23 17:20:48+0000', tz='UTC')]	<function get_reply_authors at 0x00000119FBB60...

	Original Index	Upper Case Words
0	1	[FBA]
1	6	[MDW]
2	8	[FBA, FBM, FBA, FBM, FBA]
3	9	[FNS, FBA]
4	10	[FBA, FBA, FBA]

	Original Index	Upper Case Words	Number of UpCase Words
0	1	[FBA]	1
1	6	[MDW]	1
2	8	[FBA, FBM, FBA, FBM, FBA]	5
3	9	[FNS, FBA]	2
4	10	[FBA, FBA, FBA]	3

Loading the data¶

Strategy 1¶

Explore the data¶

Text Pre-processing¶

Modeling the data¶

Naive Bayes Classifier for Multinomial Models¶

CountVectorizer + TF-IDFTransformer + MultinomialNB¶

Linear Support Vector Machine¶

CountVectorizer + TF-IDFTransformer + SGDClassifier¶

Logistic Regression¶

CountVectorizer + TF-IDFTransformer + Logistic Regression¶

Decision Tree¶

CountVectorizer + TF-IDFTransformer + DecisionTreeClassifier¶

Results¶

Results of the previously trained models¶

Cross Validation with linear SVM¶

Strategy 2¶

Modeling the data¶

Naive Bayes Classifier for Multinomial Models¶

CountVectorizer + TF-IDFTransformer + MultinomialNB¶

Linear Support Vector Machine¶

CountVectorizer + TF-IDFTransformer + SGDClassifier¶

Logistic Regression¶

CountVectorizer + TF-IDFTransformer + Logistic Regression¶

Decision Tree¶

CountVectorizer + TF-IDFTransformer + DecisionTreeClassifier¶

Results¶

Results of the previously trained models¶

Cross Validation with linear SVM¶

Strategy 3¶

Cross Validation with linear SVM¶

Random Forest¶

XGBoost¶

Light GBM¶

Other: Investigating Abbreviations¶

	Unnamed: 0	Link	Title	Category	Post Author	Leading Comment	Publish Time	Reply Authors	Reply Comments	Reply Times
0	0	http://sellercentral.amazon.com/forums/t/2-box...	2 boxes of books	Fulfillment By Amazon	Earth_Light_Books	I need to know how to start	2020-05-26 17:33:26+00:00	<function get_reply_authors at 0x00000119FBB60...	['\nI need to know how to start\n', ' \nAmazon...	[Timestamp('2020-05-26 17:48:57+0000', tz='UTC...
1	1	http://sellercentral.amazon.com/forums/t/fba-s...	FBA Shipment Delivered Delayed Check in	Fulfillment By Amazon	HOUSE_OF_LORDS_RODEO	Hi are there any other seller experiencing del...	2020-05-26 15:28:17+00:00	<function get_reply_authors at 0x00000119FBB60...	['\nHi are there any other seller experiencing...	[Timestamp('2020-05-26 16:03:50+0000', tz='UTC')]
2	2	http://sellercentral.amazon.com/forums/t/fba-r...	FBA removal/disposal fees	Fulfillment By Amazon	Texastoys	Hi all\nSo back when the new removaldisposal f...	2020-05-26 05:43:46+00:00	<function get_reply_authors at 0x00000119FBB60...	['\nHi all\nSo back when the new removaldispos...	[]
3	3	http://sellercentral.amazon.com/forums/t/am-i-...	Am i able to send in FBA inventory now?	Fulfillment By Amazon	Assuranceproducts	Im using inventorylab to upload my inventory t...	2020-05-25 16:53:40+00:00	<function get_reply_authors at 0x00000119FBB60...	['\nIm using inventorylab to upload my invento...	[Timestamp('2020-05-25 17:14:44+0000', tz='UTC...
4	4	http://sellercentral.amazon.com/forums/t/stran...	Stranded Inventory	Fulfillment By Amazon	BillsBuys	I recently listed 100 books for sale on Amazon...	2020-05-24 02:14:21+00:00	<function get_reply_authors at 0x00000119FBB60...	['\nI recently listed 100 books for sale on Am...	[Timestamp('2020-05-25 01:42:41+0000', tz='UTC...

	Model	Accuracy
0	Naive Bayes MultinomialNB	0.465962
3	Decision Tree	0.575476
2	Logistic Regression	0.727696
1	Linear SVM	0.735307

	Model	Accuracy
0	Naive Bayes MultinomialNB	0.490486
3	Decision Tree	0.582664
2	Logistic Regression	0.728118
1	Linear SVM	0.744186