多个机器学习模型对在线购物意向数据集的分类

  • 时间:
  • 来源:互联网
  • 文章标签:

使用多个机器学习模型对 Online Shoppers Purchasing Intention Dataset 数据集进行分类

注:数据源网址https://archive.ics.uci.edu/ml/datasets/Online+Shoppers+Purchasing+Intention+Dataset

注:源码地址https://github.com/HY1506698498/-/tree/main/%E8%B4%AD%E7%89%A9%E6%84%8F%E5%90%91

1.KNN(注:本节标题为 KNN,但下方代码实际是随机森林(RandomForestClassifier)的调参流程,与第 5 节的代码相同)

import pandas as pd

# Load the online-shoppers data set from the current working directory.
df = pd.read_csv('./online_shoppers_intention.csv')

# Quick inspection: first rows, dimensions, summary statistics, column names.
df.head()
df.shape
df.describe()
df.columns

# Columns used as predictors; 'Revenue' is the classification target.
feature_columns = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'Weekend',
]
X = df[feature_columns]
y = df['Revenue']

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: a forest with default hyperparameters (fixed seed),
# evaluated as the mean score over 10 cross-validation folds.
rfc = RandomForestClassifier(random_state=90)
fold_scores = cross_val_score(rfc, X, y, cv=10)
base_score = fold_scores.mean()
base_score

from sklearn.model_selection import GridSearchCV
import numpy as np

# Coarse search over the number of trees: 1, 11, 21, ..., 191.
rfc = RandomForestClassifier(n_jobs=-1, random_state=90)
param_grid = {'n_estimators': np.arange(1, 201, 10)}
clf = GridSearchCV(rfc, param_grid=param_grid, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the tree count that produced it.
clf.best_score_, clf.best_params_

from sklearn.model_selection import GridSearchCV
import numpy as np

# Fine search: every tree count from 170 to 210 inclusive.
tree_counts = np.arange(170, 211, 1)
rfc = RandomForestClassifier(n_jobs=-1, random_state=90)
clf = GridSearchCV(rfc, param_grid={'n_estimators': tree_counts}, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the tree count that produced it.
clf.best_score_, clf.best_params_

# Sweep max_depth from 1 to 19 with the tree count fixed at 195 and record
# the mean 10-fold cross-validation score for each depth.
score1 = []
for depth in range(1, 20):
    rfc = RandomForestClassifier(n_estimators=195,
                                 max_depth=depth,
                                 random_state=90,
                                 n_jobs=-1)
    score = cross_val_score(rfc, X, y, cv=10).mean()
    score1.append(score)

# Depths start at 1 while list indices start at 0, hence the +1.
best_depth_score = max(score1)
print(best_depth_score, score1.index(best_depth_score) + 1)

# Tune min_samples_leaf over 1..20 with the tree count fixed at 195.
rfc = RandomForestClassifier(n_estimators=195, random_state=90, n_jobs=-1)
param_grid = {'min_samples_leaf': np.arange(1, 21, 1)}
clf = GridSearchCV(rfc, param_grid=param_grid, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the leaf size that produced it.
clf.best_score_, clf.best_params_

# Tune max_features over 5..14 with the tree count fixed at 195.
rfc = RandomForestClassifier(n_estimators=195, random_state=90, n_jobs=-1)
param_grid = {'max_features': np.arange(5, 15)}
clf = GridSearchCV(rfc, param_grid=param_grid, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the feature count that produced it.
clf.best_score_, clf.best_params_

# Compare the two split criteria on the forest configuration being tuned.
# max_features is set to 6 to match the final model evaluated after this
# search; the previous value of 1 compared the criteria on a different,
# much weaker configuration (one candidate feature per split).
param_grid = {'criterion': ['gini', 'entropy']}
rfc = RandomForestClassifier(n_estimators=195,
                             n_jobs=-1,
                             random_state=90,
                             max_features=6)

gridsearch5 = GridSearchCV(rfc, param_grid=param_grid, cv=10)
gridsearch5.fit(X, y)

# Best mean cross-validation score and the criterion that produced it.
gridsearch5.best_score_, gridsearch5.best_params_

# Final model: 195 trees, 6 candidate features per split,
# scored as the mean over 10 cross-validation folds.
rfc = RandomForestClassifier(
    n_estimators=195,
    max_features=6,
    random_state=90,
    n_jobs=-1,
)
final_fold_scores = cross_val_score(rfc, X, y, cv=10)
score = final_fold_scores.mean()
score

2.逻辑回归模型

import pandas as pd

# Load the online-shoppers data set from the current working directory.
df = pd.read_csv('./online_shoppers_intention.csv')

# Quick inspection: first rows, summary statistics, column names.
df.head()
df.describe()
list(df.columns)

# Predictors for the logistic-regression models ('Weekend' is not included
# in this section); 'Revenue' is the classification target.
logistic_features = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
]
X = df[logistic_features]
y = df['Revenue']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Baseline: logistic regression with default settings on the raw features.
logi = LogisticRegression()
logi.fit(X_train, y_train)

# Score on the held-out split.
baseline_test_score = logi.score(X_test, y_test)
baseline_test_score

from sklearn.model_selection import GridSearchCV
import numpy as np

# Pipeline: degree-3 polynomial expansion -> standardisation -> logistic regression.
reg = Pipeline([
    ('Poly', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('logi', LogisticRegression()),
])

# Hyperparameter grid for the logistic-regression step.
# The original np.arange(0.1, 1.2, 10) used a step of 10, which yields only
# [0.1], so C was never actually searched. np.linspace gives the intended
# 0.1, 0.2, ..., 1.1 without floating-point end-point surprises.
param_grid = {
    'logi__solver': ['lbfgs', 'liblinear'],
    'logi__C': np.linspace(0.1, 1.1, 11).tolist(),
}

gs_clf = GridSearchCV(reg, param_grid=param_grid)
gs_clf.fit(X_train, y_train)

# Score of the best configuration on the held-out split.
gs_clf.score(X_test, y_test)

3.AdaBoost

import pandas as pd

# Load the online-shoppers data set from the current working directory.
df = pd.read_csv('./online_shoppers_intention.csv')

# Quick inspection: first rows and summary statistics.
df.head()
df.describe()

# Keep the column index around and display it.
cou = df.columns
cou

# Columns used as predictors; 'Revenue' is the classification target.
adaboost_features = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'Weekend',
]
X = df[adaboost_features]
y = df['Revenue']

from sklearn.model_selection import train_test_split

# Train/test split with the library's default test size; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=666,
)

from sklearn.ensemble import AdaBoostClassifier

# Baseline: AdaBoost with all default settings.
adBC = AdaBoostClassifier()
adBC.fit(X_train, y_train)

# Score on the held-out split.
adaboost_baseline_score = adBC.score(X_test, y_test)
adaboost_baseline_score

from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-9 decision trees with a small learning rate.
# The explicit algorithm='SAMME.R' argument has been dropped: that option is
# deprecated and rejected by recent scikit-learn releases, while on older
# releases it was already the default, so omitting it keeps the old
# behaviour there and lets the cell run on current versions.
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=9),
                         learning_rate=0.1,
                         random_state=200,
                         n_estimators=50)
clf.fit(X_train, y_train)

# Score on the held-out split.
clf.score(X_test, y_test)


4.Bagging

import pandas as pd

# Load the online-shoppers data set from the current working directory.
df = pd.read_csv('./online_shoppers_intention.csv')

# Quick inspection: first rows, dimensions, summary statistics, column names.
df.head()
df.shape
df.describe()
df.columns

# Columns used as predictors; 'Revenue' is the classification target.
bagging_features = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'Weekend',
]
X = df[bagging_features]
y = df['Revenue']

from sklearn.model_selection import train_test_split

# Train/test split with the library's default test size; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=666,
)

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 38 decision trees, each trained on a bootstrap sample
# of 200 rows.
bgg = BaggingClassifier(DecisionTreeClassifier(random_state=666),
                        n_estimators=38,
                        max_samples=200,
                        random_state=84,
                        bootstrap=True,
                        oob_score=True,
                        n_jobs=-1)

# Fit on the training split only. The original fitted and scored on the full
# X, y, which reports training accuracy and ignores the split made above.
bgg.fit(X_train, y_train)

# Out-of-bag estimate (requested via oob_score=True but never read before).
bgg.oob_score_

# Score on the held-out split.
bgg.score(X_test, y_test)

5.随机森林

import pandas as pd

# Load the online-shoppers data set from the current working directory.
df = pd.read_csv('./online_shoppers_intention.csv')

# Quick inspection: first rows, dimensions, summary statistics, column names.
df.head()
df.shape
df.describe()
df.columns

# Columns used as predictors; 'Revenue' is the classification target.
forest_features = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'Weekend',
]
X = df[forest_features]
y = df['Revenue']

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: a forest with default hyperparameters (fixed seed),
# evaluated as the mean score over 10 cross-validation folds.
rfc = RandomForestClassifier(random_state=90)
baseline_fold_scores = cross_val_score(rfc, X, y, cv=10)
base_score = baseline_fold_scores.mean()
base_score

from sklearn.model_selection import GridSearchCV
import numpy as np

# Coarse search over the number of trees: 1, 11, 21, ..., 191.
coarse_tree_counts = np.arange(1, 201, 10)
rfc = RandomForestClassifier(n_jobs=-1, random_state=90)
clf = GridSearchCV(rfc, param_grid={'n_estimators': coarse_tree_counts}, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the tree count that produced it.
clf.best_score_, clf.best_params_

from sklearn.model_selection import GridSearchCV
import numpy as np

# Fine search: every tree count from 170 to 210 inclusive.
rfc = RandomForestClassifier(n_jobs=-1, random_state=90)
param_grid = {'n_estimators': np.arange(170, 211, 1)}
clf = GridSearchCV(rfc, param_grid=param_grid, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the tree count that produced it.
clf.best_score_, clf.best_params_

# Sweep max_depth from 1 to 19 with the tree count fixed at 195 and record
# the mean 10-fold cross-validation score for each depth.
score1 = []
for depth in range(1, 20):
    rfc = RandomForestClassifier(n_estimators=195,
                                 max_depth=depth,
                                 random_state=90,
                                 n_jobs=-1)
    score = cross_val_score(rfc, X, y, cv=10).mean()
    score1.append(score)

# Depths start at 1 while list indices start at 0, hence the +1.
top_score = max(score1)
print(top_score, score1.index(top_score) + 1)

# Tune min_samples_leaf over 1..20 with the tree count fixed at 195.
leaf_sizes = np.arange(1, 21, 1)
rfc = RandomForestClassifier(n_estimators=195, random_state=90, n_jobs=-1)
param_grid = {'min_samples_leaf': leaf_sizes}
clf = GridSearchCV(rfc, param_grid=param_grid, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the leaf size that produced it.
clf.best_score_, clf.best_params_

# Tune max_features over 5..14 with the tree count fixed at 195.
feature_counts = np.arange(5, 15)
rfc = RandomForestClassifier(n_estimators=195, random_state=90, n_jobs=-1)
param_grid = {'max_features': feature_counts}
clf = GridSearchCV(rfc, param_grid=param_grid, cv=10)
clf.fit(X, y)

# Best mean cross-validation score and the feature count that produced it.
clf.best_score_, clf.best_params_

# Compare the two split criteria on the forest configuration being tuned.
# max_features is set to 6 to match the final model evaluated after this
# search; the previous value of 1 compared the criteria on a different,
# much weaker configuration (one candidate feature per split).
param_grid = {'criterion': ['gini', 'entropy']}
rfc = RandomForestClassifier(n_estimators=195,
                             n_jobs=-1,
                             random_state=90,
                             max_features=6)

gridsearch5 = GridSearchCV(rfc, param_grid=param_grid, cv=10)
gridsearch5.fit(X, y)

# Best mean cross-validation score and the criterion that produced it.
gridsearch5.best_score_, gridsearch5.best_params_

# Final model: 195 trees, 6 candidate features per split,
# scored as the mean over 10 cross-validation folds.
rfc = RandomForestClassifier(
    n_estimators=195,
    max_features=6,
    random_state=90,
    n_jobs=-1,
)
tuned_fold_scores = cross_val_score(rfc, X, y, cv=10)
score = tuned_fold_scores.mean()
score


本文链接http://www.taodudu.cc/news/show-1781817.html