from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from sparknlp.annotator import *
from sparknlp.base import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from datetime import datetime
import random as rd
sub_com_nlp = spark.read.parquet("/FileStore/sub_com/nlp/")
df_media = spark.read.parquet("/FileStore/sub_com/media_count/")
df_char = spark.read.parquet("/FileStore/sub_com/character_count/")
Technical proposal: By fitting the score for each comments, we could be able to fit the comment features with the comment scores. The interpretation of the model can assist us to understand the major factors on the comments score which we believe is a good way to know the values of Marvel fans. We will try to identify some of the features that will positively affect the score or negatively affect the score. This requires the models we used have a high interpretability.
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
## add extra features and drop unused columns
data_train = sub_com_nlp.withColumn('text_len', length('content'))
data_train = data_train.filter(col('is_submission'))
data_train = data_train.drop('id', 'author', 'created_ts', 'content', 'is_submission')
## encode categorical variavles to integer
data_train_score = data_train.withColumn('sentiment: pos', when(col('sentiment') == 'positive', 1).otherwise(0))\
.withColumn('sentiment: neg', when(col('sentiment') == 'negative', 1).otherwise(0))\
.withColumn('sentiment: neu', when(col('sentiment') == 'neutral', 1).otherwise(0))
data_train_score = data_train_score.drop('sentiment')
## cast other True-False variable
for c in data_train_score.columns:
if len(c.split(':')) == 2:
data_train_score = data_train_score.withColumn(c, col(c).cast('integer'))
## get the feature columns and assemble
feature_ls = [i for i in data_train_score.columns if i != 'score']
assembler = VectorAssembler(inputCols = feature_ls, outputCol = "features")
## construct model randomforest regressor and the pipeline
rf = RandomForestRegressor(labelCol = "score", featuresCol = "features")
pipeline1 = Pipeline(stages=[assembler, rf])
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol = 'score', featuresCol = 'features')
pipeline2 = Pipeline(stages=[assembler, lr])
## hyperparameters tuning and validation
paramGrid_rf = ParamGridBuilder() \
.addGrid(rf.numTrees, [int(x) for x in np.linspace(start = 10, stop = 100, num = 3)]) \
.addGrid(rf.maxDepth, [int(x) for x in np.linspace(start = 5, stop = 30, num = 3)]) \
.addGrid(rf.featureSubsetStrategy, ['auto', 'sqrt', 'log2', 'all']) \
.build()
tvs_rf = TrainValidationSplit(estimator=pipeline1,
estimatorParamMaps=paramGrid_rf,
evaluator=RegressionEvaluator(labelCol = 'score'),
trainRatio=0.8)
paramGrid_lr = ParamGridBuilder() \
.addGrid(lr.maxIter, [int(x) for x in np.linspace(start = 5, stop = 15, num = 10)]) \
.addGrid(lr.regParam, [x for x in np.linspace(start = 0, stop = 1, num = 10)]) \
.build()
tvs_lr = TrainValidationSplit(estimator=pipeline2,
estimatorParamMaps=paramGrid_lr,
evaluator=RegressionEvaluator(labelCol = 'score'),
trainRatio=0.8)
## train test split
(trainingData, testData) = data_train_score.randomSplit([0.7, 0.3])
## fit the random forest regressor
rfModel = tvs_rf.fit(trainingData)
## fit the linear regression
lrModel = tvs_lr.fit(trainingData)
## prediction - random forest regression
train_predictions_rf = rfModel.transform(trainingData)
test_predictions_rf = rfModel.transform(testData)
## prediction - linear regression
train_predictions_lr = lrModel.transform(trainingData)
test_predictions_lr = lrModel.transform(testData)
## evaluate - random forest
eval_ls = ['rmse', 'mae']
eval_df_rf = pd.DataFrame([[0,0],[0,0]], columns = eval_ls, index = ['random forest train', 'random forest test'])
for e in eval_ls:
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName=e)
eva_test = evaluator.evaluate(test_predictions_rf)
eva_train = evaluator.evaluate(train_predictions_rf)
eval_df_rf.loc['random forest train',e] = eva_train
eval_df_rf.loc['random forest test', e] = eva_test
## evaluate - linear regression
eval_ls = ['rmse', 'mae']
eval_df_lr = pd.DataFrame([[0,0],[0,0]], columns = eval_ls, index = ['linear regression train', 'linear regression test'])
for e in eval_ls:
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName=e)
eva_test = evaluator.evaluate(test_predictions_lr)
eva_train = evaluator.evaluate(train_predictions_lr)
eval_df_lr.loc['linear regression train',e] = eva_train
eval_df_lr.loc['linear regression test', e] = eva_test
eval_df = pd.concat([eval_df_rf, eval_df_lr])
eval_df
rmse | mae | |
---|---|---|
random forest train | 2204.892606 | 495.574021 |
random forest test | 2266.196969 | 498.906052 |
linear regression train | 2220.796226 | 504.343850 |
linear regression test | 2269.790856 | 503.422917 |
rfModel.save('/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/model/random_forest')
lrModel.save('/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/model/linear_regression')
sns.set_theme(style = 'darkgrid')
bestPipeline = rfModel.bestModel
bestModel = bestPipeline.stages[1]
importances = bestModel.featureImportances
f_imp_dict = dict(zip(feature_ls, np.array(importances)))
f_imp_sort = {k: v for k, v in sorted(f_imp_dict.items(), key=lambda item: item[1], reverse=True)}
imp_top10 = {k: f_imp_sort[k] for k in list(f_imp_sort)[1:16]}
imp_df = pd.DataFrame(data = imp_top10.items(), columns = ['feature', 'importance'])
imp_df.index = imp_df['feature']
x_values = list(range(len(importances)))
fig, ax = plt.subplots(figsize = (12,8), dpi = 80)
ax = sns.barplot(x = imp_df.index, y = imp_df['importance'], orientation = 'vertical', color = '#fcbf49', alpha = 0.8)
plt.xticks(rotation = 40)
ax.set_ylabel('Importance', fontsize = 14, family = 'serif')
ax.set_xlabel('Feature', fontsize = 14, family = 'serif')
ax.set_title('Feature Importances - Random Forest', fontsize = 20, y = 1.03, family = 'serif')
for container in ax.containers:
ax.bar_label(container, fontsize = 10, family = 'serif', padding = 5, fmt = '%.3f')
plt.show()
plt.savefig('/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/plots/feature_random_forest.png')
<Figure size 432x288 with 0 Axes>
bestPipeline = lrModel.bestModel
bestModel = bestPipeline.stages[1]
lr_coef = bestModel.coefficients
coef_dict = dict(zip(feature_ls, np.array(lr_coef)))
coef_sort = {k: v for k, v in sorted(coef_dict.items(), key=lambda item: np.abs(item[1]), reverse=True)}
coef_top10 = {k: coef_sort[k] for k in list(coef_sort)[1:16]}
coef_df = pd.DataFrame(data = coef_top10.items(), columns = ['feature', 'coefficient'])
coef_df.index = coef_df['feature']
# x_values = list(range(len(lr_coef)))
fig, ax = plt.subplots(figsize = (12,8), dpi = 80)
ax = sns.barplot(x = coef_df.index, y = coef_df['coefficient'], color = '#fcbf49', alpha = 0.7)
plt.xticks(rotation = 40)
ax.set_ylabel('Importance', fontsize = 14, family = 'serif')
ax.set_xlabel('Feature', fontsize = 14, family = 'serif')
ax.set_title('Feature Coefficient - Linear Regression', fontsize = 20, y = 1.03, family = 'serif')
for container in ax.containers:
ax.bar_label(container, fontsize = 10, family = 'serif', padding = 5)
plt.savefig('/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/plots/feature_linear_regression.png')
Technical proposal: Fit Linear Regression Model to find the correlation between score/sentiment of each media and its IMDB Rating. Use both Linear Regression and Lasso Regression to predict the Rating based on features, tune the hyperparameters of Lasso Regression and finally evaluate the performances of all these models by the prediction results and metrics.
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
pd_media = df_media.toPandas()
imdb_df = pd.read_csv("/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/imdb_rating.csv")
#add imdb rating data to media dataframe
pd_media = pd.merge(pd_media,imdb_df)
media_ls = [i for i in sub_com_nlp.columns if i[:5] == 'movie' or i[:6] == 'series']
media_col_ls = ['score', 'sentiment'] + media_ls
media_ml = sub_com_nlp[media_col_ls]
score_media = ['series:wandavision',
'series:the falcon and the winter soldier',
'series:loki',
'movie:black widow',
'series:what if',
'movie:shang-chi',
'movie:eternals',
'series:hawkeye',
'movie:spider-man',
'series:moon knight',
'movie:doctor strange',
'series:ms marvel',
'movie:thor',
'series:she-hulk']
#calculate score for each media
score_ls = []
for i in score_media:
score = list(media_ml.filter(col(i) == "true").select(sum('score')).toPandas()['sum(score)'])[0]
score_ls.append(score)
pd_media['score'] = score_ls
#calculate positive mention count for each media
pos = []
for i in score_media:
pos_num = list(media_ml.filter(col(i) == "true").groupby(col('sentiment')).count().toPandas()['count'])[0]
pos.append(pos_num)
#calculate neural mention count for each media
neu = []
for i in score_media:
neu_num = list(media_ml.filter(col(i) == "true").groupby(col('sentiment')).count().toPandas()['count'])[2]
neu.append(neu_num)
#calculate negative mention count for each media
neg = []
for i in score_media:
neg_num = list(media_ml.filter(col(i) == "true").groupby(col('sentiment')).count().toPandas()['count'])[3]
neg.append(neg_num)
#add them into media dataframe
pd_media['positive_count'] = pos
pd_media['neural_count'] = neu
pd_media['negative_count'] = neg
#save media dataframe to csv
pd_media.to_csv('/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/media_ml.csv', index=False)
#split x, y for linear regression
feature_ls = ['score','positive_count','neural_count','negative_count']
X = pd_media.loc[:, feature_ls]
y = pd_media['imdb_rating']
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=549)
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
y_train_scaled = y_train.copy()
y_test_scaled = y_test.copy()
#scale x
num_scaler = MinMaxScaler()
X_train_scaled = num_scaler.fit_transform(X_train)
X_test_scaled = num_scaler.transform(X_test)
#scale y
target_scaler = MinMaxScaler()
y_train_scaled = target_scaler.fit_transform(np.array(y_train).reshape(-1,1))
y_test_scaled = target_scaler.transform(np.array(y_test).reshape(-1,1))
#fit linear regression model
model = LinearRegression()
model.fit(X_train_scaled,y_train_scaled)
Out[368]: LinearRegression()
#feature importance
importance = model.coef_
imp_df = pd.DataFrame(importance.tolist()[0], index = feature_ls, columns = ['importance'])
# summarize feature importance
for i,v in enumerate(importance.tolist()[0]):
print('Feature:', feature_ls[i])
print('Score: %.5f' % (v))
# plot feature importance
fig, ax = plt.subplots(figsize = (10,8), dpi = 80)
ax = sns.barplot(x = imp_df.index, y = imp_df['importance'], alpha = 0.7, color = '#fcbf49')
ax.set_title('Feature Importance Plot of Linear Regression', fontsize = 20, y = 1.03, family = 'serif')
ax.set_ylabel('Features', fontsize = 14, family = 'serif')
ax.set_xlabel('Importance', fontsize = 14, family = 'serif')
for container in ax.containers:
ax.bar_label(container, fontsize = 10, family = 'serif', padding = 5)
plt.show()
Feature: score Score: 2.54816 Feature: positive_count Score: -1.62949 Feature: neural_count Score: -1.79536 Feature: negative_count Score: 1.36032
#predict by scaled train data
y_pred = model.predict(X_train_scaled)
#Comparison of Predict Rating and True Rating
lr_train_rate = pd.DataFrame(target_scaler.inverse_transform(y_pred).tolist(), columns=['Linear Predict Rating'])
lr_train_rate['True Rating'] = y_train.tolist()
lr_train_rate
Linear Predict Rating | True Rating | |
---|---|---|
0 | 7.188347 | 7.9 |
1 | 7.155756 | 7.3 |
2 | 6.767678 | 6.2 |
3 | 6.856264 | 6.3 |
4 | 8.231075 | 8.2 |
5 | 7.519860 | 7.4 |
6 | 6.866379 | 7.4 |
7 | 7.468594 | 7.5 |
8 | 8.446047 | 8.3 |
#metrics
#r squre
r2 = r2_score(y_train_scaled, y_pred)
#mae
mae = mean_absolute_error(y_train_scaled, y_pred)
#mse
mse = mean_squared_error(y_train_scaled, y_pred)
#rmse
rmse = mean_squared_error(y_train_scaled, y_pred, squared=False)
#metrics table
lr_df = pd.DataFrame([[r2,mae,mse,rmse]], columns = ['R Square', 'MAE', 'MSE', 'RMSE'], index = ['linear regression train'])
lr_df
R Square | MAE | MSE | RMSE | |
---|---|---|---|---|
linear regression train | 0.660933 | 0.150362 | 0.037323 | 0.193191 |
#predict by scaled test data
y_test_pred = model.predict(X_test_scaled)
#Comparison of Predict Rating and True Rating
lr_test_rate = pd.DataFrame(target_scaler.inverse_transform(y_test_pred).tolist(), columns=['Linear Predict Rating'])
lr_test_rate['True Rating'] = y_test.tolist()
lr_test_rate
Linear Predict Rating | True Rating | |
---|---|---|
0 | 8.241021 | 6.3 |
1 | 7.104002 | 7.2 |
2 | 7.546718 | 6.9 |
3 | 6.831790 | 5.2 |
4 | 7.289781 | 6.7 |
#metrics
#r squre
r2_test = r2_score(y_test_scaled, y_test_pred)
#mae
mae_test = mean_absolute_error(y_test_scaled, y_test_pred)
#mse
mse_test = mean_squared_error(y_test_scaled, y_test_pred)
#rmse
rmse_test = mean_squared_error(y_test_scaled, y_test_pred, squared=False)
#metrics table
lr_test_df = pd.DataFrame([[r2_test,mae_test,mse_test,rmse_test]], columns = ['R Square', 'MAE', 'MSE', 'RMSE'], index = ['linear regression test'])
lr_test_df
R Square | MAE | MSE | RMSE | |
---|---|---|---|---|
linear regression test | -1.987397 | 0.467172 | 0.326785 | 0.571651 |
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
#fit lasso model
lasso =Lasso(alpha=1.0)
lasso.fit(X_train_scaled, y_train_scaled)
Out[333]: Lasso()
#predict by scaled train data
pred_train_lasso = lasso.predict(X_train_scaled)
#metrics
#r squre
r2_train_lasso = r2_score(y_train_scaled,pred_train_lasso)
#mae
mae_train_lasso = mean_absolute_error(y_train_scaled, pred_train_lasso)
#mse
mse_train_lasso = mean_squared_error(y_train_scaled, pred_train_lasso)
#rmse
rmse_train_lasso = mean_squared_error(y_train_scaled,pred_train_lasso, squared=False)
#metrics table
lasso_train_df = pd.DataFrame([[r2_train_lasso,mae_train_lasso,mse_train_lasso,rmse_train_lasso]], columns = ['R Square', 'MAE', 'MSE', 'RMSE'], index = ['lasso train'])
lasso_train_df
R Square | MAE | MSE | RMSE | |
---|---|---|---|---|
lasso train | 0.0 | 0.250441 | 0.110075 | 0.331776 |
#predict by scaled test data
pred_test_lasso = lasso.predict(X_test_scaled)
#metrics
#r squre
r2_test_lasso = r2_score(y_test_scaled,pred_test_lasso)
#mae
mae_test_lasso = mean_absolute_error(y_test_scaled, pred_test_lasso)
#mse
mse_test_lasso = mean_squared_error(y_test_scaled, pred_test_lasso)
#rmse
rmse_test_lasso = mean_squared_error(y_test_scaled,pred_test_lasso, squared=False)
#metrics table
lasso_test_df = pd.DataFrame([[r2_test_lasso,mae_test_lasso,mse_test_lasso,rmse_test_lasso]], columns = ['R Square', 'MAE', 'MSE', 'RMSE'], index = ['lasso test'])
lasso_test_df
R Square | MAE | MSE | RMSE | |
---|---|---|---|---|
lasso test | -1.788629 | 0.442328 | 0.305042 | 0.552306 |
# Tuning Lasso Hyperparameters
# define model evaluation method
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
# define model
lasso = LassoCV(alphas=np.arange(0, 1, 0.01), cv=cv, n_jobs=-1)
# fit model
lasso.fit(X_train_scaled, y_train_scaled)
/databricks/python/lib/python3.9/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). return f(*args, **kwargs) /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.10931310802615724, tolerance: 4.794298671849688e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.08085285674146903, tolerance: 5.701328150307742e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.08731870634097749, tolerance: 9.238743116294132e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.12261640937599771, tolerance: 9.873663751214768e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.14013396947757117, tolerance: 9.90646258503401e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.09971021069915487, tolerance: 6.880466472303204e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.09655479533288913, tolerance: 6.252024619371556e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.10312339405381366, tolerance: 8.19565921606738e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.12020885588417574, tolerance: 7.78749595076125e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.08790943138904972, tolerance: 9.240362811791378e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.15598347265376766, tolerance: 9.860706187236795e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.07101191726389133, tolerance: 6.517654680919985e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.10931310802615724, tolerance: 4.794298671849688e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.0035464694098918947, tolerance: 9.906057661159697e-05 model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: UserWarning: Coordinate descent with alpha=0 may lead to unexpected results and is discouraged. model = cd_fast.enet_coordinate_descent_gram( /databricks/python/lib/python3.9/site-packages/sklearn/linear_model/_coordinate_descent.py:526: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 0.15907390751062933, tolerance: 8.228458049886621e-05 model = cd_fast.enet_coordinate_descent_gram( Out[340]: LassoCV(alphas=array([0. , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]), cv=RepeatedKFold(n_repeats=3, n_splits=5, random_state=1), n_jobs=-1)
#predict by scaled train data
pred_train_lasso_conf = lasso.predict(X_train_scaled)
#Comparison of Predict Rating and True Rating
lasso_train_rate = pd.DataFrame(target_scaler.inverse_transform(pred_train_lasso_conf.reshape(-1, 1)).tolist(), columns=['Lasso Predict Rating'])
lasso_train_rate['True Rating'] = y_train.tolist()
lasso_train_rate
Lasso Predict Rating | True Rating | |
---|---|---|
0 | 7.355075 | 7.9 |
1 | 6.970137 | 7.3 |
2 | 6.855368 | 6.2 |
3 | 7.169788 | 6.3 |
4 | 8.232496 | 8.2 |
5 | 7.457900 | 7.4 |
6 | 7.048901 | 7.4 |
7 | 7.208643 | 7.5 |
8 | 8.201692 | 8.3 |
#metrics
#r squre
r2_train_lasso_conf = r2_score(y_train_scaled,pred_train_lasso_conf)
#mae
mae_train_lasso_conf = mean_absolute_error(y_train_scaled, pred_train_lasso_conf)
#mse
mse_train_lasso_conf = mean_squared_error(y_train_scaled, pred_train_lasso_conf)
#rmse
rmse_train_lasso_conf = mean_squared_error(y_train_scaled,pred_train_lasso_conf, squared=False)
#metrics table
lasso_train_df_conf = pd.DataFrame([[r2_train_lasso_conf,mae_train_lasso_conf,mse_train_lasso_conf,rmse_train_lasso_conf]], columns = ['R Square', 'MAE', 'MSE', 'RMSE'], index = ['Tuning lasso train'])
lasso_train_df_conf
R Square | MAE | MSE | RMSE | |
---|---|---|---|---|
Tuning lasso train | 0.584786 | 0.170958 | 0.045705 | 0.213787 |
#predict by scaled test data
pred_test_lasso_conf = lasso.predict(X_test_scaled)
#Comparison of Predict Rating and True Rating
lasso_test_rate = pd.DataFrame(target_scaler.inverse_transform(pred_test_lasso_conf.reshape(-1, 1)).tolist(), columns=['Lasso Predict Rating'])
lasso_test_rate['True Rating'] = y_test.tolist()
lasso_test_rate
Lasso Predict Rating | True Rating | |
---|---|---|
0 | 8.237978 | 6.3 |
1 | 6.713171 | 7.2 |
2 | 7.069394 | 6.9 |
3 | 6.832388 | 5.2 |
4 | 7.047747 | 6.7 |
#metrics
#r squre
r2_test_lasso_conf = r2_score(y_test_scaled,pred_test_lasso_conf)
#mae
mae_test_lasso_conf = mean_absolute_error(y_test_scaled, pred_test_lasso_conf)
#mse
mse_test_lasso_conf = mean_squared_error(y_test_scaled, pred_test_lasso_conf)
#rmse
rmse_test_lasso_conf = mean_squared_error(y_test_scaled,pred_test_lasso_conf, squared=False)
#metrics table
lasso_test_df_conf = pd.DataFrame([[r2_test_lasso_conf,mae_test_lasso_conf,mse_test_lasso_conf,rmse_test_lasso_conf]], columns = ['R Square', 'MAE', 'MSE', 'RMSE'], index = ['Tuning lasso test'])
lasso_test_df_conf
R Square | MAE | MSE | RMSE | |
---|---|---|---|---|
Tuning lasso test | -1.822171 | 0.435651 | 0.308711 | 0.555618 |
#append all metrics tables and compare
metrics_df = pd.concat([lr_df,lr_test_df,lasso_train_df,lasso_test_df,lasso_train_df_conf,lasso_test_df_conf])
metrics_df
R Square | MAE | MSE | RMSE | |
---|---|---|---|---|
linear regression train | 0.660933 | 0.150362 | 0.037323 | 0.193191 |
linear regression test | -1.987397 | 0.467172 | 0.326785 | 0.571651 |
lasso train | 0.000000 | 0.250441 | 0.110075 | 0.331776 |
lasso test | -1.788629 | 0.442328 | 0.305042 | 0.552306 |
Tuning lasso train | 0.584786 | 0.170958 | 0.045705 | 0.213787 |
Tuning lasso test | -1.822171 | 0.435651 | 0.308711 | 0.555618 |
#append all predict results of train and test data and compare with the true value
train_rate = lr_train_rate.merge(lasso_train_rate)
train_rate
Linear Predict Rating | True Rating | Lasso Predict Rating | |
---|---|---|---|
0 | 7.188347 | 7.9 | 7.355075 |
1 | 7.155756 | 7.3 | 6.970137 |
2 | 6.767678 | 6.2 | 6.855368 |
3 | 6.856264 | 6.3 | 7.169788 |
4 | 8.231075 | 8.2 | 8.232496 |
5 | 7.519860 | 7.4 | 7.457900 |
6 | 7.519860 | 7.4 | 7.048901 |
7 | 6.866379 | 7.4 | 7.457900 |
8 | 6.866379 | 7.4 | 7.048901 |
9 | 7.468594 | 7.5 | 7.208643 |
10 | 8.446047 | 8.3 | 8.201692 |
test_rate = lr_test_rate.merge(lasso_test_rate)
test_rate
Linear Predict Rating | True Rating | Lasso Predict Rating | |
---|---|---|---|
0 | 8.241021 | 6.3 | 8.237978 |
1 | 7.104002 | 7.2 | 6.713171 |
2 | 7.546718 | 6.9 | 7.069394 |
3 | 6.831790 | 5.2 | 6.832388 |
4 | 7.289781 | 6.7 | 7.047747 |
# fig, ax = plt.subplots(1,2,figsize=(15,6))
# ax[0].plot(train_rate['Linear Predict Rating'],linewidth=2)
# ax[0].plot(train_rate['Lasso Predict Rating'],linewidth=2)
# ax[0].plot(train_rate['True Rating'],linewidth=2)
# ax[0].set_title('Train Data',family = 'serif',fontsize = 15)
# ax[0].set_xlabel('Media Number',family = 'serif',fontsize = 15)
# ax[0].set_ylabel('Rating',family = 'serif',fontsize = 15)
# ax[0].legend(['Linear', 'Lasso', "True"])
# ax[1].plot(test_rate['Linear Predict Rating'],linewidth=2)
# ax[1].plot(test_rate['Lasso Predict Rating'],linewidth=2)
# ax[1].plot(test_rate['True Rating'],linewidth=2)
# ax[1].set_title('Test Data',family = 'serif',fontsize = 15)
# ax[1].set_xlabel('Media Number',family = 'serif',fontsize = 15)
# ax[1].set_ylabel('Rating',family = 'serif',fontsize= 15)
# ax[1].legend(['Linear', 'Lasso', "True"])
# fig.suptitle('Comparison among Predict Rating Results From Different Models and True Rating Data',fontsize = 20,family = 'serif')