import numpy as np
import pandas as pd
# Load the Yelp reviews from 'yelp.csv' (a relative path, so it is resolved
# against the current working directory) into a DataFrame.
yelp = pd.read_csv('yelp.csv')
# Preview the first rows to check that the columns parsed as expected.
yelp.head()
| | business_id | date | review_id | stars | text | type | user_id | cool | useful | funny |
---|---|---|---|---|---|---|---|---|---|---|
0 | 9yKzy9PApeiPPOUJEtnvkg | 2011-01-26 | fWKvX83p0-ka4JS3dc6E5A | 5 | My wife took me here on my birthday for breakf... | review | rLtl8ZkDX5vH5nAx9C3q5Q | 2 | 5 | 0 |
1 | ZRJwVLyzEJq1VAihDhYiow | 2011-07-27 | IjZ33sJrzXqU-0X6U8NwyA | 5 | I have no idea why some people give bad review... | review | 0a2KyEL0d3Yb1V6aivbIuQ | 0 | 0 | 0 |
2 | 6oRAC4uyJCsJl1X0WZpVSA | 2012-06-14 | IESLBzqUCLdSzSqm0eCSxQ | 4 | love the gyro plate. Rice is so good and I als... | review | 0hT2KtfLiobPvh6cDC8JQg | 0 | 1 | 0 |
3 | _1QQZuf4zZOyFCvXc0o6Vg | 2010-05-27 | G-WvGaISbqqaMHlNnByodA | 5 | Rosie, Dakota, and I LOVE Chaparral Dog Park!!... | review | uZetl9T0NcROGOyFfughhg | 1 | 2 | 0 |
4 | 6ozycU1RpktNG2-1BroVtw | 2012-01-05 | 1uJFq2r5QfJG_6ExMRCaGw | 5 | General Manager Scott Petello is a good egg!!!... | review | vYmM4KTsC8ZfQBg-j5MWkw | 0 | 0 | 0 |
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and approximate memory usage.
yelp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64
 8   useful       10000 non-null  int64
 9   funny        10000 non-null  int64
dtypes: int64(4), object(6)
memory usage: 781.4+ KB
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the frame.
yelp.describe()
| | stars | cool | useful | funny |
---|---|---|---|---|
count | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 |
mean | 3.777500 | 0.876800 | 1.409300 | 0.701300 |
std | 1.214636 | 2.067861 | 2.336647 | 1.907942 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 4.000000 | 0.000000 | 1.000000 | 0.000000 |
75% | 5.000000 | 1.000000 | 2.000000 | 1.000000 |
max | 5.000000 | 77.000000 | 76.000000 | 57.000000 |
yelp['text length'] = yelp['text'].apply(len)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline
g = sns.FacetGrid(yelp,col='stars')
g.map(plt.hist,'text length')
C:\Users\ACER\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
<seaborn.axisgrid.FacetGrid at 0x23effa5e350>
# Distribution of review length for each star rating.
sns.boxplot(data=yelp, x='stars', y='text length', palette='rainbow')
<Axes: xlabel='stars', ylabel='text length'>
# Number of reviews at each star rating.
sns.countplot(data=yelp, x='stars', palette='rainbow')
<Axes: xlabel='stars', ylabel='count'>
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Keep only the 1-star and 5-star reviews: a two-class problem.
yelp_class = yelp[yelp['stars'].isin([1, 5])]
X = yelp_class['text']
y = yelp_class['stars']

# Split the RAW text before vectorizing. Fitting CountVectorizer on the full
# corpus first would build the vocabulary from test reviews as well, leaking
# held-out information into training. The same rows land in each split as
# before, because the split depends only on the sample count and random_state.
# Note: X stays as the raw text Series; only X_train / X_test are count matrices.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

cv = CountVectorizer()
X_train = cv.fit_transform(X_train)  # learn the vocabulary from training text only
X_test = cv.transform(X_test)        # reuse that vocabulary; unseen tokens are dropped

# Multinomial Naive Bayes on the token-count features.
nb = MultinomialNB()
nb.fit(X_train, y_train)
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out reviews with the fitted Naive Bayes model.
predictions = nb.predict(X_test)

# Confusion matrix, a blank spacer, then the per-class precision/recall/F1 table.
print(
    confusion_matrix(y_test, predictions),
    '\n',
    classification_report(y_test, predictions),
    sep='\n',
)
[[159  69]
 [ 22 976]]


              precision    recall  f1-score   support

           1       0.88      0.70      0.78       228
           5       0.93      0.98      0.96       998

    accuracy                           0.93      1226
   macro avg       0.91      0.84      0.87      1226
weighted avg       0.92      0.93      0.92      1226
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Start again from the raw review text: the pipeline's first stage does the
# vectorizing itself.
X, y = yelp_class['text'], yelp_class['stars']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

# Three chained stages: token counts -> TF-IDF weights -> Naive Bayes.
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

# Fitting may take some time.
pipeline.fit(X_train, y_train)
Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()), ('classifier', MultinomialNB())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()), ('classifier', MultinomialNB())])
CountVectorizer()
TfidfTransformer()
MultinomialNB()
# Score the held-out reviews with the fitted TF-IDF pipeline.
predictions = pipeline.predict(X_test)
print(confusion_matrix(y_test, predictions))
# When a class is never predicted its precision and F-score are undefined.
# zero_division=0 reports them as 0.0 (the same value scikit-learn falls back
# to) without emitting an UndefinedMetricWarning for each averaged row.
print(classification_report(y_test, predictions, zero_division=0))
[[  0 228]
 [  0 998]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       228
           5       0.81      1.00      0.90       998

    accuracy                           0.81      1226
   macro avg       0.41      0.50      0.45      1226
weighted avg       0.66      0.81      0.73      1226
C:\Users\ACER\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\ACER\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\ACER\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))