from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from datetime import datetime
import random as rd
# Load processed intermediate data from parquet files and print the schema
# so the available columns can be checked before the NLP stages run.
with_dummy_path = "/FileStore/sub_com/with_dummy/"
sub_com_with_dummy = spark.read.parquet(with_dummy_path)
sub_com_with_dummy.printSchema()
root |-- id: string (nullable = true) |-- author: string (nullable = true) |-- created_ts: timestamp (nullable = true) |-- content: string (nullable = true) |-- score: integer (nullable = true) |-- is_submission: boolean (nullable = true) |-- character:black widow: boolean (nullable = true) |-- character:yelena belova: boolean (nullable = true) |-- character:shang-chi: boolean (nullable = true) |-- character:sersi: boolean (nullable = true) |-- character:ikaris: boolean (nullable = true) |-- character:thena: boolean (nullable = true) |-- character:ajak: boolean (nullable = true) |-- character:spider-man: boolean (nullable = true) |-- character:doctor strange: boolean (nullable = true) |-- character:electro: boolean (nullable = true) |-- character:green goblin: boolean (nullable = true) |-- character:doc ock: boolean (nullable = true) |-- character:wong: boolean (nullable = true) |-- character:wanda: boolean (nullable = true) |-- character:thor: boolean (nullable = true) |-- character:jane foster: boolean (nullable = true) |-- character:gorr: boolean (nullable = true) |-- character:vision: boolean (nullable = true) |-- character:agnes: boolean (nullable = true) |-- character:falcon: boolean (nullable = true) |-- character:bucky: boolean (nullable = true) |-- character:john walker: boolean (nullable = true) |-- character:captain america: boolean (nullable = true) |-- character:loki: boolean (nullable = true) |-- character:casey: boolean (nullable = true) |-- character:the watcher: boolean (nullable = true) |-- character:nick fury: boolean (nullable = true) |-- character:iron man: boolean (nullable = true) |-- character:hawkeye: boolean (nullable = true) |-- character:ultron: boolean (nullable = true) |-- character:red skull: boolean (nullable = true) |-- character:captain marvel: boolean (nullable = true) |-- character:captain carter: boolean (nullable = true) |-- character:hulk: boolean (nullable = true) |-- character:nebula: boolean (nullable = true) |-- 
character:hank pym: boolean (nullable = true) |-- character:ant-man: boolean (nullable = true) |-- character:thanos: boolean (nullable = true) |-- character:kate bishop: boolean (nullable = true) |-- character:kingpin: boolean (nullable = true) |-- character:moon knight: boolean (nullable = true) |-- character:arthur harrow: boolean (nullable = true) |-- character:kamala khan: boolean (nullable = true) |-- character:she-hulk: boolean (nullable = true) |-- character:abomination: boolean (nullable = true) |-- character:odin: boolean (nullable = true) |-- character:pepper potts: boolean (nullable = true) |-- character:mj: boolean (nullable = true) |-- character:ned: boolean (nullable = true) |-- character:happy: boolean (nullable = true) |-- movie:black widow: boolean (nullable = true) |-- movie:shang-chi: boolean (nullable = true) |-- movie:eternals: boolean (nullable = true) |-- movie:spider-man: boolean (nullable = true) |-- movie:doctor strange: boolean (nullable = true) |-- movie:thor: boolean (nullable = true) |-- series:wandavision: boolean (nullable = true) |-- series:the falcon and the winter soldier: boolean (nullable = true) |-- series:loki: boolean (nullable = true) |-- series:what if: boolean (nullable = true) |-- series:hawkeye: boolean (nullable = true) |-- series:moon knight: boolean (nullable = true) |-- series:ms marvel: boolean (nullable = true) |-- series:she-hulk: boolean (nullable = true)
# cleaning pipeline
# Stages, in order: content -> document -> token -> normalized ->
# clean_normalized (stop words removed) -> lemma -> assembled (text rebuilt
# from the lemmas).
document_assembler = (
    DocumentAssembler()
    .setInputCol("content")
    .setOutputCol("document")
)
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    # Raw string: \w, \d and \s are regex character classes, not Python
    # string escapes (a non-raw literal triggers an invalid-escape warning).
    .setCleanupPatterns([r"[^\w\d\s]"])
)
stop_words = (
    StopWordsCleaner.pretrained("stopwords_en", "en")
    .setInputCols(["normalized"])
    .setOutputCol("clean_normalized")
)
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["clean_normalized"])
    .setOutputCol("lemma")
)
token_assembler = (
    TokenAssembler()
    .setInputCols(["document", "lemma"])
    .setOutputCol("assembled")
)
clean_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stop_words,
        lemmatizer,
        token_assembler,
    ]
)
stopwords_en download started this may take some time. Approximate size to download 2.9 KB [ | ] [OK!] lemma_antbnc download started this may take some time. Approximate size to download 907.6 KB [ | ] [OK!]
# Fit the cleaning pipeline on the loaded data, apply it to the same data,
# and preview the re-assembled text of the first 10 rows.
clean_model = clean_pipeline.fit(sub_com_with_dummy)
cleaned_df = clean_model.transform(sub_com_with_dummy)
cleaned_df.select(col('assembled.result')).show(10, truncate=False)
+--------------------------------------------------------------------------------------------+ |result | +--------------------------------------------------------------------------------------------+ |[white wolf] | |[dont race] | |[huh lowkey salty] | |[decent guy unfairly treat sambucky] | |[hey im bromo humor change tone episode episode general feel goofy] | |[battlestar join bucky sam stop walker deep end care friend what] | |[racist micro aggression place steve arrogance think mantel sam publicly give shield museum]| |[agatha nosy neighbor 350] | |[story make knot hardcore fan half think] | |[nice 5he character breath experience wouldnt 2 hr movie] | +--------------------------------------------------------------------------------------------+ only showing top 10 rows
# Sentiment pipeline: sentence embeddings computed from the assembled text
# are fed to a pretrained sentiment classifier.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")
    .setInputCols(["assembled"])
    .setOutputCol("sentence_embeddings")
)
sentimentdl = (
    SentimentDLModel.pretrained(name="sentimentdl_use_twitter", lang="en")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("sentiment")
)
sentiment_pipeline = Pipeline(stages=[use, sentimentdl])
tfhub_use download started this may take some time. Approximate size to download 923.7 MB [ | ] [OK!] sentimentdl_use_twitter download started this may take some time. Approximate size to download 11.4 MB [ | ] [OK!]
# Run the sentiment pipeline on the cleaned data and preview the first
# assembled text next to the first predicted sentiment label of each row.
sentiment_model = sentiment_pipeline.fit(cleaned_df)
sentiment_df = sentiment_model.transform(cleaned_df)
preview_cols = [
    col('assembled.result').getItem(0).alias("content"),
    col("sentiment.result").getItem(0).alias("sentiment"),
]
sentiment_df.select(*preview_cols).show()
+--------------------+---------+ | content|sentiment| +--------------------+---------+ | white wolf| positive| | dont race| negative| | huh lowkey salty| negative| |decent guy unfair...| neutral| |hey im bromo humo...| positive| |battlestar join b...| negative| |racist micro aggr...| positive| |agatha nosy neigh...| negative| |story make knot h...| positive| |nice 5he characte...| positive| |abouthttpsyoutube...| positive| |feel walker cap o...| positive| | wait| positive| |bucky fall helica...| negative| |love shit talk sa...| positive| | username check| positive| |yea kind assume s...| positive| |uh sense leader t...| negative| | ao mcu| positive| |episode make laug...| positive| +--------------------+---------+ only showing top 20 rows
# Print the schema of the pipeline output to inspect the annotation columns
# that now sit alongside the original columns.
sentiment_df.printSchema()
root |-- id: string (nullable = true) |-- author: string (nullable = true) |-- created_ts: timestamp (nullable = true) |-- content: string (nullable = true) |-- score: integer (nullable = true) |-- is_submission: boolean (nullable = true) |-- character:black widow: boolean (nullable = true) |-- character:yelena belova: boolean (nullable = true) |-- character:shang-chi: boolean (nullable = true) |-- character:sersi: boolean (nullable = true) |-- character:ikaris: boolean (nullable = true) |-- character:thena: boolean (nullable = true) |-- character:ajak: boolean (nullable = true) |-- character:spider-man: boolean (nullable = true) |-- character:doctor strange: boolean (nullable = true) |-- character:electro: boolean (nullable = true) |-- character:green goblin: boolean (nullable = true) |-- character:doc ock: boolean (nullable = true) |-- character:wong: boolean (nullable = true) |-- character:wanda: boolean (nullable = true) |-- character:thor: boolean (nullable = true) |-- character:jane foster: boolean (nullable = true) |-- character:gorr: boolean (nullable = true) |-- character:vision: boolean (nullable = true) |-- character:agnes: boolean (nullable = true) |-- character:falcon: boolean (nullable = true) |-- character:bucky: boolean (nullable = true) |-- character:john walker: boolean (nullable = true) |-- character:captain america: boolean (nullable = true) |-- character:loki: boolean (nullable = true) |-- character:casey: boolean (nullable = true) |-- character:the watcher: boolean (nullable = true) |-- character:nick fury: boolean (nullable = true) |-- character:iron man: boolean (nullable = true) |-- character:hawkeye: boolean (nullable = true) |-- character:ultron: boolean (nullable = true) |-- character:red skull: boolean (nullable = true) |-- character:captain marvel: boolean (nullable = true) |-- character:captain carter: boolean (nullable = true) |-- character:hulk: boolean (nullable = true) |-- character:nebula: boolean (nullable = true) |-- 
character:hank pym: boolean (nullable = true) |-- character:ant-man: boolean (nullable = true) |-- character:thanos: boolean (nullable = true) |-- character:kate bishop: boolean (nullable = true) |-- character:kingpin: boolean (nullable = true) |-- character:moon knight: boolean (nullable = true) |-- character:arthur harrow: boolean (nullable = true) |-- character:kamala khan: boolean (nullable = true) |-- character:she-hulk: boolean (nullable = true) |-- character:abomination: boolean (nullable = true) |-- character:odin: boolean (nullable = true) |-- character:pepper potts: boolean (nullable = true) |-- character:mj: boolean (nullable = true) |-- character:ned: boolean (nullable = true) |-- character:happy: boolean (nullable = true) |-- movie:black widow: boolean (nullable = true) |-- movie:shang-chi: boolean (nullable = true) |-- movie:eternals: boolean (nullable = true) |-- movie:spider-man: boolean (nullable = true) |-- movie:doctor strange: boolean (nullable = true) |-- movie:thor: boolean (nullable = true) |-- series:wandavision: boolean (nullable = true) |-- series:the falcon and the winter soldier: boolean (nullable = true) |-- series:loki: boolean (nullable = true) |-- series:what if: boolean (nullable = true) |-- series:hawkeye: boolean (nullable = true) |-- series:moon knight: boolean (nullable = true) |-- series:ms marvel: boolean (nullable = true) |-- series:she-hulk: boolean (nullable = true) |-- document: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- token: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string 
(nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- normalized: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- clean_normalized: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- lemma: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- assembled: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: 
map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- sentence_embeddings: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- sentiment: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false)
# Keep the original metadata columns, the first assembled (cleaned) text as
# `content`, the first predicted label as `sentiment`, and every dummy column
# whose name starts with one of the three known prefixes.
dummy_prefixes = ("character:", "movie:", "series:")
dummy_colnames = [c for c in sentiment_df.columns if c.startswith(dummy_prefixes)]
selected_cols = [
    "id",
    "author",
    "created_ts",
    col('assembled.result').getItem(0).alias("content"),
    "score",
    "is_submission",
    col("sentiment.result").getItem(0).alias("sentiment"),
    *dummy_colnames,
]
sub_com_nlp = sentiment_df.select(*selected_cols)
sub_com_nlp.printSchema()
root |-- id: string (nullable = true) |-- author: string (nullable = true) |-- created_ts: timestamp (nullable = true) |-- content: string (nullable = true) |-- score: integer (nullable = true) |-- is_submission: boolean (nullable = true) |-- sentiment: string (nullable = true) |-- character:black widow: boolean (nullable = true) |-- character:yelena belova: boolean (nullable = true) |-- character:shang-chi: boolean (nullable = true) |-- character:sersi: boolean (nullable = true) |-- character:ikaris: boolean (nullable = true) |-- character:thena: boolean (nullable = true) |-- character:ajak: boolean (nullable = true) |-- character:spider-man: boolean (nullable = true) |-- character:doctor strange: boolean (nullable = true) |-- character:electro: boolean (nullable = true) |-- character:green goblin: boolean (nullable = true) |-- character:doc ock: boolean (nullable = true) |-- character:wong: boolean (nullable = true) |-- character:wanda: boolean (nullable = true) |-- character:thor: boolean (nullable = true) |-- character:jane foster: boolean (nullable = true) |-- character:gorr: boolean (nullable = true) |-- character:vision: boolean (nullable = true) |-- character:agnes: boolean (nullable = true) |-- character:falcon: boolean (nullable = true) |-- character:bucky: boolean (nullable = true) |-- character:john walker: boolean (nullable = true) |-- character:captain america: boolean (nullable = true) |-- character:loki: boolean (nullable = true) |-- character:casey: boolean (nullable = true) |-- character:the watcher: boolean (nullable = true) |-- character:nick fury: boolean (nullable = true) |-- character:iron man: boolean (nullable = true) |-- character:hawkeye: boolean (nullable = true) |-- character:ultron: boolean (nullable = true) |-- character:red skull: boolean (nullable = true) |-- character:captain marvel: boolean (nullable = true) |-- character:captain carter: boolean (nullable = true) |-- character:hulk: boolean (nullable = true) |-- character:nebula: 
boolean (nullable = true) |-- character:hank pym: boolean (nullable = true) |-- character:ant-man: boolean (nullable = true) |-- character:thanos: boolean (nullable = true) |-- character:kate bishop: boolean (nullable = true) |-- character:kingpin: boolean (nullable = true) |-- character:moon knight: boolean (nullable = true) |-- character:arthur harrow: boolean (nullable = true) |-- character:kamala khan: boolean (nullable = true) |-- character:she-hulk: boolean (nullable = true) |-- character:abomination: boolean (nullable = true) |-- character:odin: boolean (nullable = true) |-- character:pepper potts: boolean (nullable = true) |-- character:mj: boolean (nullable = true) |-- character:ned: boolean (nullable = true) |-- character:happy: boolean (nullable = true) |-- movie:black widow: boolean (nullable = true) |-- movie:shang-chi: boolean (nullable = true) |-- movie:eternals: boolean (nullable = true) |-- movie:spider-man: boolean (nullable = true) |-- movie:doctor strange: boolean (nullable = true) |-- movie:thor: boolean (nullable = true) |-- series:wandavision: boolean (nullable = true) |-- series:the falcon and the winter soldier: boolean (nullable = true) |-- series:loki: boolean (nullable = true) |-- series:what if: boolean (nullable = true) |-- series:hawkeye: boolean (nullable = true) |-- series:moon knight: boolean (nullable = true) |-- series:ms marvel: boolean (nullable = true) |-- series:she-hulk: boolean (nullable = true)
# Persist the processed NLP dataframe as parquet, then load it back from the
# same location so later cells work from the stored copy.
nlp_parquet_path = "/FileStore/sub_com/nlp/"
sub_com_nlp.write.parquet(nlp_parquet_path)
sub_com_nlp = spark.read.parquet(nlp_parquet_path)
# Load external data
# IMDb ratings are read from a CSV with pandas; the character and media
# count tables are read from parquet with Spark.
imdb_csv_path = "/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/imdb_rating.csv"
imdb = pd.read_csv(imdb_csv_path)
df_char = spark.read.parquet("/FileStore/sub_com/character_count/")
df_media = spark.read.parquet("/FileStore/sub_com/media_count/")
# pandas copy of the media table
media_timeline_pd = df_media.toPandas()
# Top 10 most common words in the content column: split each text on single
# spaces, emit one row per word, count per word, and keep the 10 largest.
common_words = (
    sub_com_nlp
    .withColumn('words', explode(split('content', ' ')))
    .groupBy('words')
    .count()
    .orderBy(desc('count'))
    .limit(10)
)
common_words.show()
+---------+------+ | words| count| +---------+------+ | movie|728717| | make|594576| | show|491784| | dont|488970| |character|440987| | people|439364| | im|435627| | time|432575| | mcu|426524| | marvel|384753| +---------+------+
# Length of each content, measured as the number of pieces obtained by
# splitting the text on single spaces.
word_count_col = size(split(col('content'), ' ')).alias('word_count')
text_length = sub_com_nlp.select(col('content'), word_count_col)
text_length.show()
+--------------------+----------+ | content|word_count| +--------------------+----------+ | white wolf| 2| | dont race| 2| | huh lowkey salty| 3| |decent guy unfair...| 5| |hey im bromo humo...| 11| |battlestar join b...| 11| |racist micro aggr...| 13| |agatha nosy neigh...| 4| |story make knot h...| 7| |nice 5he characte...| 9| |abouthttpsyoutube...| 6| |feel walker cap o...| 9| | wait| 1| |bucky fall helica...| 9| |love shit talk sa...| 7| | username check| 2| |yea kind assume s...| 17| |uh sense leader t...| 6| | ao mcu| 2| |episode make laug...| 4| +--------------------+----------+ only showing top 20 rows
# NOTE(review): this collects every content string to the driver; confirm
# length_pd is still needed before keeping it on large inputs.
length_pd = text_length.toPandas()
# distribution: number of texts per word count, ordered by word count
length_distribution = (
    text_length
    .groupby('word_count')
    .count()
    .orderBy(asc('word_count'))
)
distribution_pd = length_distribution.toPandas()
distribution_pd
word_count | count | |
---|---|---|
0 | 1 | 736297 |
1 | 2 | 450477 |
2 | 3 | 453257 |
3 | 4 | 416889 |
4 | 5 | 369788 |
... | ... | ... |
862 | 3156 | 1 |
863 | 3189 | 1 |
864 | 3453 | 2 |
865 | 3565 | 2 |
866 | 3836 | 1 |
867 rows × 2 columns
# create a function to bin the text length
def bin_text(dat, max_len = 50, min_len = 0, binsize = 1):
    """
    Bin the text length into labelled groups.

    dat: pd.DataFrame, with column word_count; a 'group' column is added to
        it in place
    max_len: the maximum of the text length that gets a regular bin; longer
        texts share the final '>max_len' group
    min_len: lower edge of the first bin (a length equal to min_len is
        counted in the first bin)
    binsize: the size of each bin

    Returns dat, now carrying the categorical 'group' column. Lengths below
    min_len are left as NaN.
    """
    # Regular edges, followed by an open-ended overflow bin. The overflow
    # edge is infinite rather than the observed maximum so the edges stay
    # strictly increasing even when no text is longer than max_len (or when
    # dat is empty).
    bin_ls = list(range(min_len, max_len + binsize, binsize)) + [np.inf]
    bin_labels = [f'{i}-{i+binsize}' for i in range(min_len, max_len, binsize)] + [f'>{max_len}']
    # include_lowest keeps lengths equal to min_len in the first bin instead
    # of silently dropping them to NaN.
    dat['group'] = pd.cut(
        dat['word_count'],
        bins = bin_ls,
        labels = bin_labels,
        include_lowest = True,
    )
    return dat
# Binning parameters - change if you like
max_len = 50
min_len = 0
binsize = 2
# Label every word_count row with its bin, then total the text counts per bin.
binned_df = bin_text(distribution_pd, max_len, min_len, binsize)
group_totals = binned_df.groupby('group').apply(lambda grp: grp['count'].sum())
binned_df = group_totals.to_frame('count').reset_index()
# distribution plot
# Theme and font are configured before the figure is created: rcParams are
# picked up when artists are created, so setting them afterwards would only
# affect the title and axis labels, not the ticks or bar labels.
sns.set_theme(style = 'white')
plt.rcParams['font.family'] = 'P052'
fig, ax = plt.subplots(figsize = (18,10), dpi = 300)
# Draw on the axes created above instead of relying on the implicit
# "current axes".
sns.barplot(x = binned_df['group'], y = binned_df['count'], alpha = 0.7, color = '#fcbf49', ax = ax)
# Annotate every bar with its count.
for container in ax.containers:
    ax.bar_label(container, fontsize = 8)
ax.set_title('Text Length Distribution', fontsize = 20, y = 1.03)
ax.set_ylabel('Count', fontsize = 14)
ax.set_xlabel('Text Length', fontsize = 14)
fig.savefig('text_len_dist.png')
from pyspark.ml.feature import HashingTF as MLHashingTF
from pyspark.ml.feature import IDF as MLIDF
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import udf,lit
#extract content information and tag every text with a generated id
content_df = sub_com_nlp.select(col('content'))
content_df = content_df.withColumn("text_id", monotonically_increasing_id())
#bag-of-words
# Tokenise with the built-in split column function (as the word-count cells
# already do) instead of a Python lambda over the RDD: same single-space
# tokens, no row-by-row round trip through Python.
bow_df = content_df.select(
    col("text_id"),
    split(col("content"), " ").alias("features"),
)
#TF: hash each token list in `features` into a term-frequency vector `tf`
htf = MLHashingTF().setInputCol("features").setOutputCol("tf")
tf = htf.transform(bow_df)
tf.show()
+-------+--------------------+--------------------+ |text_id| features| tf| +-------+--------------------+--------------------+ | 0| [white, wolf]|(262144,[75571,21...| | 1| [dont, race]|(262144,[87273,22...| | 2|[huh, lowkey, salty]|(262144,[180689,2...| | 3|[decent, guy, unf...|(262144,[34611,16...| | 4|[hey, im, bromo, ...|(262144,[31015,61...| | 5|[battlestar, join...|(262144,[54502,74...| | 6|[racist, micro, a...|(262144,[5481,395...| | 7|[agatha, nosy, ne...|(262144,[133662,1...| | 8|[story, make, kno...|(262144,[17252,77...| | 9|[nice, 5he, chara...|(262144,[12524,22...| | 10|[abouthttpsyoutub...|(262144,[5451,500...| | 11|[feel, walker, ca...|(262144,[30796,54...| | 12| [wait]|(262144,[150069],...| | 13|[bucky, fall, hel...|(262144,[37521,42...| | 14|[love, shit, talk...|(262144,[54502,64...| | 15| [username, check]|(262144,[23032,20...| | 16|[yea, kind, assum...|(262144,[8443,488...| | 17|[uh, sense, leade...|(262144,[29129,11...| | 18| [ao, mcu]|(262144,[170147,2...| | 19|[episode, make, l...|(262144,[2437,897...| +-------+--------------------+--------------------+ only showing top 20 rows
#IDF: fit on the term-frequency vectors, then rescale them into column `idf`
idf = MLIDF(inputCol="tf", outputCol="idf")
idf_model = idf.fit(tf)
tfidf = idf_model.transform(tf)
tfidf.show(5, truncate=False)
+-------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |text_id|features |tf |idf | +-------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |0 |[white, wolf] |(262144,[75571,211527],[1.0,1.0]) |(262144,[75571,211527],[4.969006641728334,7.480357711105211]) | |1 |[dont, race] |(262144,[87273,227686],[1.0,1.0]) |(262144,[87273,227686],[2.5815101421520965,6.076172362698935]) | |2 |[huh, lowkey, salty] |(262144,[180689,230868,246953],[1.0,1.0,1.0]) |(262144,[180689,230868,246953],[8.203474561356371,6.579460854136553,6.474726267612691]) | |3 |[decent, guy, unfairly, treat, sambucky] |(262144,[34611,161102,163240,257513,259390],[1.0,1.0,1.0,1.0,1.0]) |(262144,[34611,161102,163240,257513,259390],[10.007210221631142,3.8406961442001126,6.27442918245799,9.503639768746323,5.953506389680384]) | |4 |[hey, im, bromo, humor, change, tone, episode, episode, general, feel, 
goofy]|(262144,[31015,61756,61899,62133,75898,109557,113241,140461,193181,223059],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0])|(262144,[31015,61756,61899,62133,75898,109557,113241,140461,193181,223059],[2.695232909381837,5.694015111736679,3.174706602237098,6.472381050777942,5.878263772926869,13.32743854075963,5.348746734570755,7.264986649321078,7.043097104322763,4.279695532729672])| +-------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ only showing top 5 rows
#create an array column holding the stored tf-idf values for future analysis
def _vector_values(v):
    """Return the stored values of vector `v` as a plain Python list."""
    return v.values.tolist()


list_ = udf(_vector_values, ArrayType(DoubleType()))
tfidf = tfidf.withColumn("idf_list", list_("idf"))
#zero-based index of the (first) max value inside each idf_list
# NOTE(review): this is a position in idf_list, not in `features`. In the
# printed output the vector entries are ordered by hash index and repeated
# tokens share one entry, so the index does not generally point at the same
# position in `features` - confirm before mapping it back to a word.
max_index_expr = expr("array_position(idf_list,array_max(idf_list))-1")
tfidf = tfidf.withColumn('max_value_index', max_index_expr)
tfidf.show()
+-------+--------------------+--------------------+--------------------+--------------------+---------------+ |text_id| features| tf| idf| idf_list|max_value_index| +-------+--------------------+--------------------+--------------------+--------------------+---------------+ | 0| [white, wolf]|(262144,[75571,21...|(262144,[75571,21...|[4.96900664172833...| 1| | 1| [dont, race]|(262144,[87273,22...|(262144,[87273,22...|[2.58151014215209...| 1| | 2|[huh, lowkey, salty]|(262144,[180689,2...|(262144,[180689,2...|[8.20347456135637...| 0| | 3|[decent, guy, unf...|(262144,[34611,16...|(262144,[34611,16...|[10.0072102216311...| 0| | 4|[hey, im, bromo, ...|(262144,[31015,61...|(262144,[31015,61...|[2.69523290938183...| 5| | 5|[battlestar, join...|(262144,[54502,74...|(262144,[54502,74...|[4.68169578991286...| 3| | 6|[racist, micro, a...|(262144,[5481,395...|(262144,[5481,395...|[9.86867130727599...| 0| | 7|[agatha, nosy, ne...|(262144,[133662,1...|(262144,[133662,1...|[10.3541791230576...| 0| | 8|[story, make, kno...|(262144,[17252,77...|(262144,[17252,77...|[4.88724639127954...| 6| | 9|[nice, 5he, chara...|(262144,[12524,22...|(262144,[12524,22...|[3.63985690562423...| 4| | 10|[abouthttpsyoutub...|(262144,[5451,500...|(262144,[5451,500...|[4.45493578155102...| 5| | 11|[feel, walker, ca...|(262144,[30796,54...|(262144,[30796,54...|[4.27765823379588...| 5| | 12| [wait]|(262144,[150069],...|(262144,[150069],...| [4.417557824261202]| 0| | 13|[bucky, fall, hel...|(262144,[37521,42...|(262144,[37521,42...|[10.2431655894537...| 0| | 14|[love, shit, talk...|(262144,[54502,64...|(262144,[54502,64...|[4.68169578991286...| 0| | 15| [username, check]|(262144,[23032,20...|(262144,[23032,20...|[8.07807801893812...| 0| | 16|[yea, kind, assum...|(262144,[8443,488...|(262144,[8443,488...|[5.97144251320312...| 10| | 17|[uh, sense, leade...|(262144,[29129,11...|(262144,[29129,11...|[7.35532672216746...| 0| | 18| [ao, mcu]|(262144,[170147,2...|(262144,[170147,2...|[5.74260900584325...| 0| | 
19|[episode, make, l...|(262144,[2437,897...|(262144,[2437,897...|[4.49151690848628...| 2| +-------+--------------------+--------------------+--------------------+--------------------+---------------+ only showing top 20 rows
# Build tfidf_max before displaying it, so this cell also works on a clean
# top-to-bottom run (it previously relied on a definition from a later cell).
tfidf_max = tfidf.select(col('text_id'), col('idf_list'), array_max(tfidf.idf_list).alias('max_value'), col('features'), col('max_value_index'))
tfidf_max.show(truncate=False)
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------+---------------+ |text_id|idf_list |max_value |features |max_value_index| +-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------+---------------+ |0 |[4.969006641728334, 7.480357711105211] |7.480357711105211 |[white, wolf] |1 | |1 |[2.5815101421520965, 6.076172362698935] |6.076172362698935 |[dont, race] |1 | |2 |[8.203474561356371, 6.579460854136553, 6.474726267612691] |8.203474561356371 |[huh, lowkey, salty] |0 | |3 |[10.007210221631142, 3.8406961442001126, 6.27442918245799, 9.503639768746323, 5.953506389680384] |10.007210221631142|[decent, guy, unfairly, treat, sambucky] |0 | |4 |[2.695232909381837, 5.694015111736679, 3.174706602237098, 6.472381050777942, 5.878263772926869, 13.32743854075963, 5.348746734570755, 7.264986649321078, 7.043097104322763, 4.279695532729672] |13.32743854075963 |[hey, im, bromo, humor, change, tone, episode, episode, general, feel, goofy] |5 | |5 |[4.681695789912868, 4.747875845490661, 4.889577198625523, 8.294824339944599, 5.899831826804189, 4.550397067821048, 3.413126452140354, 4.6725531803223035, 4.472995637256793, 5.9477370760724195, 5.303449371759564] 
|8.294824339944599 |[battlestar, join, bucky, sam, stop, walker, deep, end, care, friend, what] |3 | |6 |[9.868671307275998, 7.800216472462312, 4.3922516454728715, 3.767568693005586, 7.856569409013444, 3.398315730631671, 6.204213523439637, 4.6725531803223035, 9.525726556149166, 7.932296989576053, 9.458555027992102, 4.622457279712255, 4.565035938569167] |9.868671307275998 |[racist, micro, aggression, place, steve, arrogance, think, mantel, sam, publicly, give, shield, museum] |0 | |7 |[10.354179123057698, 5.428161379227023, 10.267167746068068, 8.127714515474835] |10.354179123057698|[agatha, nosy, neighbor, 350] |0 | |8 |[4.887246391279542, 7.862665559193956, 2.4379144831788278, 4.097360903100415, 3.690016538450417, 3.398315730631671, 9.897042004405213] |9.897042004405213 |[story, make, knot, hardcore, fan, half, think] |6 | |9 |[3.639856905624234, 4.894496901790983, 5.148862106027095, 7.494579023828287, 12.480140680372427, 4.052799259683881, 8.67136402490977, 2.3627809950312626, 2.8357724248149934] |12.480140680372427|[nice, 5he, character, breath, experience, wouldnt, 2, hr, movie] |4 | |10 |[4.4549357815510255, 5.103513364534742, 4.172199875566682, 3.7275065283500983, 4.6725531803223035, 14.138368756975959] |14.138368756975959|[abouthttpsyoutubemoye_l80d4y, kid, call, sam, black, falcon] |5 | |11 |[4.277658233795889, 3.2983124680830587, 3.174706602237098, 3.3604755319391955, 3.0904195795374405, 6.245916713455606, 4.964381214465575, 4.464781908979701, 5.303449371759564] |6.245916713455606 |[feel, walker, cap, opposite, hes, perfect, soldier, good, man] |5 | |12 |[4.417557824261202] |4.417557824261202 |[wait] |0 | |13 |[10.243165589453717, 7.24903206351784, 4.681695789912868, 4.598508833806416, 4.474805391370244, 7.679638700854365, 4.622457279712255, 7.943453291899503] |10.243165589453717|[bucky, fall, helicarrier, rescue, steve, w, high, fall, know] |0 | |14 |[4.681695789912868, 4.194351252559876, 4.049792645994886, 2.4379144831788278, 4.6725531803223035, 
3.232880775727519, 2.702116474577111] |4.681695789912868 |[love, shit, talk, sam, bucky, show, make] |0 | |15 |[8.078078018938124, 4.715157416456306] |8.078078018938124 |[username, check] |0 | |16 |[5.971442513203123, 5.107095244073235, 3.2983124680830587, 9.288293527892145, 4.397458959197189, 4.274284934339655, 5.196888703210969, 5.6581508016846165, 6.41215610646843, 5.1277606320535245, 11.167954291406257, 6.064264662264575, 3.43943130351661, 8.427941739601088, 4.798316778478119, 6.513505659989033, 4.565035938569167]|11.167954291406257|[yea, kind, assume, scene, hes, throw, shield, hit, target, lake, gma, vibranium, super, light, versatile, material, easy]|10 | |17 |[7.3553267221674625, 4.322497309695291, 6.253886377315836, 3.353223326060119, 4.087226578356274, 7.096082585036216] |7.3553267221674625|[uh, sense, leader, tomorrow, back, line] |0 | |18 |[5.742609005843253, 2.781319626585721] |5.742609005843253 |[ao, mcu] |0 | |19 |[4.491516908486286, 2.4379144831788278, 5.737372359962233, 3.632493324660539] |5.737372359962233 |[episode, make, laugh, hard] |2 | +-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------+---------------+ only showing top 20 rows
# Add the largest TF-IDF weight found in each text's idf_list as `max_value`
tfidf_max = tfidf.select(
    col('text_id'),
    col('idf_list'),
    array_max(tfidf.idf_list).alias('max_value'),
    col('features'),
    col('max_value_index'),
)
# Register the result as the SQL view "df" and show the one row whose
# max_value is the highest across all texts
tfidf_max.createOrReplaceTempView("df")
spark.sql("SELECT * FROM df order by max_value desc limit 1").show()
+-------+--------------------+------------------+--------------------+---------------+ |text_id| idf_list| max_value| features|max_value_index| +-------+--------------------+------------------+--------------------+---------------+ | 511587|[13964.774292624299]|13964.774292624299|[heck, heck, heck...| 0| +-------+--------------------+------------------+--------------------+---------------+
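According to TF-IDF, the highest-weighted word is 'heck'. Note that this maximum comes from a single text (text_id 511587) that repeats 'heck', so the score is likely inflated by repetition within that one text.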
# Load the per-media mention counts and bring them into pandas
media_info = spark.read.parquet("/FileStore/sub_com/media_count/")
media_info_pd_orginal = media_info.toPandas()
# Most-mentioned media first, with a fresh 0..n-1 index
media_info_pd = (
    media_info_pd_orginal
    .sort_values(by=['mention_count'], ascending=False)
    .reset_index(drop=True)
)
media_info_pd
media_name | release_date | is_movie | box_office (USD) | investment (USD) | mention_count | |
---|---|---|---|---|---|---|
0 | spider-man | 2021-12-17 | True | 1.916307e+09 | 200000000 | 136529 |
1 | loki | 2021-06-09 | False | NaN | 225000000 | 133711 |
2 | thor | 2022-07-08 | True | 7.607553e+08 | 250000000 | 132231 |
3 | wandavision | 2021-01-15 | False | NaN | 200000000 | 59422 |
4 | eternals | 2021-11-05 | True | 4.020649e+08 | 200000000 | 49710 |
5 | Hawkeye | 2021-11-24 | False | NaN | 150000000 | 45844 |
6 | black widow | 2021-07-09 | True | 3.797517e+08 | 200000000 | 41987 |
7 | what if | 2021-08-11 | False | NaN | 150000000 | 40495 |
8 | doctor strange | 2022-05-06 | True | 9.557758e+08 | 200000000 | 31266 |
9 | moon knight | 2022-03-30 | False | NaN | 150000000 | 25062 |
10 | shang-chi | 2021-09-03 | True | 4.322433e+08 | 150000000 | 23290 |
11 | ms marvel | 2022-06-08 | False | NaN | 150000000 | 20818 |
12 | she-hulk | 2022-08-17 | False | NaN | 225000000 | 19844 |
13 | the falcon and the winter soldier | 2021-03-19 | False | NaN | 150000000 | 2479 |
# Top 5 movies/series by mention count (media_info_pd is already sorted in
# descending mention_count order) -- answers the "popular media" question
media_info_pd.loc[:, ['media_name', 'mention_count']].head(5)
media_name | mention_count | |
---|---|---|
0 | spider-man | 136529 |
1 | loki | 133711 |
2 | thor | 132231 |
3 | wandavision | 59422 |
4 | eternals | 49710 |
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of mention counts, one bar per media title
fig, ax = plt.subplots(figsize=(26, 12))
p1 = sns.barplot(
    data=media_info_pd,
    x="media_name",
    y="mention_count",
    dodge=False,
    color="#fcbf49",
    alpha=0.7,
)
# Print each bar's value on top of it
for bars in ax.containers:
    ax.bar_label(bars, fontsize=10)
# Rotate the media names so long titles do not overlap
ax.tick_params(axis='x', rotation=45)
p1.set_title("The Popularity of Media in 2021-2022 Aug", fontsize='20')
p1.set_xlabel("Media Name", fontsize="20")
p1.set_ylabel("Count", fontsize="20")
Out[93]: Text(0, 0.5, 'Count')
# Retrieve the movie and series indicator columns ("movie:<name>" / "series:<name>")
media_columns = [x for x in sub_com_nlp.columns if x.startswith(('movie:', 'series:'))]
import pandas as pd
# One list per output column; entry k of every list describes media_columns[k]
media_stats = {'media_name': [], 'type': [], 'count': [], 'score': [], 'positive': []}
for media in media_columns:
    # Split only on the first ':' so a title that itself contains ':' still unpacks
    t, m = media.split(':', 1)
    media_stats['type'].append(t)
    media_stats['media_name'].append(m)
    # Rows whose indicator column for this media is true
    df_temp = sub_com_nlp.filter(col(media))
    # Named n_mentions (not `count`) so the star-imported pyspark `count`
    # function is not rebound to an int
    n_mentions = df_temp.count()
    media_stats['count'].append(n_mentions)
    if n_mentions == 0:
        # No rows mention this media: same fallback values as the character
        # co-occurrence loop uses for an empty pair
        positive_perc = 0
        score_avg = 0
    else:
        positive_rows = (
            df_temp.groupby('sentiment').count()
            .filter(col('sentiment') == 'positive')
            .collect()
        )
        # The 'positive' group is absent when no row is positive; indexing
        # [0] unconditionally would raise IndexError
        positive_perc = positive_rows[0]['count'] / n_mentions if positive_rows else 0.0
        score_avg = df_temp.agg({'score': 'avg'}).collect()[0]['avg(score)']
    media_stats['positive'].append(positive_perc)
    media_stats['score'].append(score_avg)
media_stats_pd = pd.DataFrame(media_stats)
# Join the per-media Reddit statistics with the `imdb` table on media_name
# NOTE(review): assumes imdb['media_name'] uses the same lowercase spelling as
# the "movie:"/"series:" column suffixes -- confirm
corr_df = pd.merge(media_stats_pd, imdb, on = "media_name")
sort_movie = media_info_pd_orginal.sort_values('media_name', ascending=False)
sort_movie = sort_movie.reset_index(drop=True)
# media_info spells one title "Hawkeye" while the indicator column is
# "series:hawkeye"; lowercase the join key so the inner merge below does not
# silently drop that title. assign() returns a new frame, so
# media_info_pd_orginal itself is left unchanged.
sort_movie = sort_movie.assign(media_name=sort_movie['media_name'].str.lower())
sort_corr = corr_df.sort_values('media_name', ascending=False)
sort_corr = sort_corr.reset_index(drop=True)
final_corr = pd.merge(sort_corr, sort_movie, on = "media_name")
# Keep only the columns used by the correlation plots
final_corr = final_corr.drop(columns = ["is_movie", "release_date", "mention_count"])
def hide_current_axis(*args, **kwds):
    """Hide the axes that matplotlib currently treats as active.

    All positional and keyword arguments are accepted and ignored, so the
    function can be passed to a grid-mapping call that supplies plot data.
    """
    current_axes = plt.gca()
    current_axes.set_visible(False)
# Set style and font scale in ONE call. Calling sns.set(font_scale=...) after
# sns.set_style(...) resets the style to seaborn's default, which discards the
# whitegrid / no-grid settings.
sns.set(style="whitegrid", font_scale=1.1, rc={'axes.grid': False})
# Pair plot of the columns of final_corr, coloured by media type
g = sns.pairplot(final_corr, hue ="type", markers=["o", "s"], palette = "summer")
# Lower triangle: regression fit without confidence band; upper triangle: hidden
g.map_lower(sns.regplot, ci=None)
g.map_upper(hide_current_axis)
# Bounding box of the whole grid in figure coordinates: left/top come from the
# first panel, right/bottom from the last panel
(xmin, _), (_, ymax) = g.axes[0, 0].get_position().get_points()
(_, ymin), (xmax, _) = g.axes[-1, -1].get_position().get_points()
# Transparent axes laid over the full grid to host the correlation heatmap
ax = g.fig.add_axes([xmin, ymin, xmax - xmin, ymax - ymin], facecolor='none')
corr1 = final_corr[["count", "score", "positive","imdb_rating","box_office (USD)","investment (USD)"]].corr()
# Mask the lower triangle (diagonal included) so only the upper triangle is
# annotated, i.e. the area where the pair-plot panels were hidden
mask1 = np.tril(np.ones_like(corr1, dtype=bool))
sns.heatmap(corr1, mask=mask1, vmax=.5, vmin=-.5,
            linewidths=.3, cmap="summer", cbar=False, annot=True, annot_kws={'size': 15}, ax=ax)
ax.set_title("Catch the correlation between imdb media ratings, box office, investment and audience reviews", fontsize = "15")
ax.set_xticks([])
ax.set_yticks([])
Out[152]: []
# Media indicator columns are the ones whose name starts with "movie" or "series"
media_ls = [name for name in sub_com_nlp.columns if name.startswith(('movie', 'series'))]
media_col_ls = ['created_ts', 'score', 'sentiment'] + media_ls
# Keep only the analysis columns and derive the calendar date from created_ts
media_nlp = sub_com_nlp.select(*media_col_ls).withColumn('date', to_date('created_ts'))
# Turn every media indicator into an integer so it can be summed later
for col_name in media_ls:
    media_nlp = media_nlp.withColumn(col_name, col(col_name).cast('int'))
media_nlp.printSchema()
root |-- created_ts: timestamp (nullable = true) |-- score: integer (nullable = true) |-- sentiment: string (nullable = true) |-- movie:black widow: integer (nullable = true) |-- movie:shang-chi: integer (nullable = true) |-- movie:eternals: integer (nullable = true) |-- movie:spider-man: integer (nullable = true) |-- movie:doctor strange: integer (nullable = true) |-- movie:thor: integer (nullable = true) |-- series:wandavision: integer (nullable = true) |-- series:the falcon and the winter soldier: integer (nullable = true) |-- series:loki: integer (nullable = true) |-- series:what if: integer (nullable = true) |-- series:hawkeye: integer (nullable = true) |-- series:moon knight: integer (nullable = true) |-- series:ms marvel: integer (nullable = true) |-- series:she-hulk: integer (nullable = true) |-- date: date (nullable = true)
#define a function to analyse the discussion heat of media based on score and sentiment analysis
def media_heat(dat, score_weight = 0, sentiment_weight = (1, 1, 1)):
    """
    Calculate the daily discussion heat of each media title; return a pd.DataFrame.

    dat: pyspark df; must include the `date`, `sentiment` and `score` columns and
         every column named in the module-level list `media_ls`
    score_weight: if != 0, every media column is multiplied by score * score_weight
                  before aggregation; default 0 leaves the media columns as they are
    sentiment_weight: three weights, in the order positive, neutral, negative;
                      any sentiment that is neither 'positive' nor 'negative'
                      gets the neutral weight; (1, 1, 1) simply adds up all rows
    returns: pd.DataFrame with columns ['date', 'score'] + media_ls, one row per date
    """
    if score_weight != 0:
        for col_name in media_ls:
            # 'score' must be quoted: it is a column name, not a Python variable
            dat = dat.withColumn(col_name, col(col_name) * col('score') * score_weight)
    dat_pd = dat.groupby('date', 'sentiment').sum().toPandas()
    # Spark names aggregated columns "sum(<col>)". Recover the original names by
    # stripping that wrapper instead of relabelling by position, which would
    # mislabel the `sentiment` and `score` columns.
    dat_pd = dat_pd.rename(
        columns=lambda c: c[4:-1] if c.startswith('sum(') and c.endswith(')') else c
    )
    dat_pd = dat_pd.sort_values(by = ['date', 'sentiment'])
    # One weight per (date, sentiment) row
    row_weight = np.where(
        dat_pd['sentiment'] == 'positive',
        sentiment_weight[0],
        np.where(dat_pd['sentiment'] == 'negative', sentiment_weight[2], sentiment_weight[1]),
    )
    for m in media_ls:
        dat_pd[m] = dat_pd[m] * row_weight
    # Collapse the sentiment dimension. Only the numeric columns are summed, so
    # the string `sentiment` column never enters the reduction.
    dat_pd = dat_pd.groupby('date')[['score'] + media_ls].sum().reset_index()
    return dat_pd
# Daily heat per media with the default weights (plain row counts)
media_nlp_pd = media_heat(media_nlp)
# change column names, generate the final heat table of each media
media_heat_pd = media_nlp_pd[['date']+media_ls]
# Drop the "movie:" / "series:" prefix by splitting on the first ':' rather
# than assuming that exactly the first six columns are movies
media_rename_ls = ['date'] + [name.split(':', 1)[-1] for name in media_ls]
media_heat_pd.columns = media_rename_ls
media_heat_pd
date | black widow | shang-chi | eternals | spider-man | doctor strange | thor | wandavision | the falcon and the winter soldier | loki | what if | hawkeye | moon knight | ms marvel | she-hulk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-01-01 | 13 | 4 | 7 | 34 | 7 | 40 | 33 | 1 | 24 | 4 | 12 | 4 | 2 | 4 |
1 | 2021-01-02 | 34 | 5 | 10 | 50 | 7 | 58 | 33 | 3 | 25 | 9 | 22 | 7 | 15 | 13 |
2 | 2021-01-03 | 32 | 8 | 11 | 54 | 14 | 181 | 32 | 0 | 43 | 7 | 11 | 2 | 9 | 22 |
3 | 2021-01-04 | 16 | 6 | 3 | 52 | 8 | 61 | 34 | 2 | 32 | 5 | 16 | 2 | 11 | 4 |
4 | 2021-01-05 | 21 | 6 | 9 | 55 | 22 | 43 | 78 | 1 | 45 | 23 | 20 | 3 | 11 | 11 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
603 | 2022-08-27 | 57 | 64 | 51 | 183 | 58 | 224 | 95 | 0 | 110 | 36 | 86 | 66 | 93 | 518 |
604 | 2022-08-28 | 46 | 55 | 39 | 151 | 76 | 164 | 162 | 3 | 211 | 121 | 146 | 131 | 140 | 262 |
605 | 2022-08-29 | 19 | 7 | 13 | 145 | 18 | 145 | 31 | 0 | 63 | 30 | 35 | 34 | 26 | 119 |
606 | 2022-08-30 | 31 | 49 | 31 | 133 | 21 | 190 | 32 | 2 | 60 | 22 | 34 | 33 | 64 | 246 |
607 | 2022-08-31 | 24 | 51 | 33 | 135 | 31 | 143 | 31 | 1 | 67 | 13 | 36 | 46 | 38 | 167 |
608 rows × 15 columns
## visualize
def visulize_media_heat(dat, timeline, topn = 10):
    """
    Plot the daily heat of the `topn` media with the largest total heat.

    dat: pd.DataFrame with a `date` column and one heat column per media
    timeline: pd.DataFrame with `release_date` (must expose .year/.month/.day)
              and `media_name`; one dotted vertical line is drawn per row
    topn: number of media lines to draw, default 10

    Shows the figure and writes it to "media_heat_viz.html". Returns None.
    """
    # Rank by total heat. `date` is left out of the reduction because it is not
    # numeric: summing it triggers a FutureWarning on older pandas and a
    # TypeError on newer versions.
    rank_df = dat.drop(columns = ['date']).sum().sort_values(ascending = False)
    char_chose = list(rank_df[:topn].index)
    dat_viz = dat[['date']+char_chose]
    # Long format: one row per (date, media) pair
    dat_viz = pd.melt(dat_viz, id_vars = ['date'], value_vars = char_chose)
    dat_viz.columns = ['date', 'media', 'heat']
    # The title reports how many media are actually drawn instead of a fixed "10"
    # NOTE(review): "Marval" in the title looks like a typo for "Marvel" -- left as is
    fig = px.line(dat_viz, x = 'date', y = 'heat', color = 'media', width = 1100, height = 600,
                  title = f'Discussion Heat of Top {len(char_chose)} Marval Medias on Reddit')
    fig.update_layout(plot_bgcolor = '#FCFBF8', margin = dict(t = 70, l = 20, b = 20, r = 20), font_family = 'Roboto Slab', xaxis_title = 'Date',
                      yaxis_title = 'Heat', title = {'font': {'size':24}})
    for index, row in timeline.iterrows():
        dt = row['release_date']
        # Release date expressed as milliseconds since 1970-01-01
        pos = int((datetime(dt.year, dt.month, dt.day) - datetime(1970,1,1)).total_seconds()) * 1000
        fig.add_vline(x = pos, line_width = 1, line_dash = 'dot', opacity = 0.7)
        # Random height between 2000 and 4000 for each release label
        fig.add_annotation(x = pos+1, y = rd.randint(2000, 4000), text = row['media_name'], showarrow = True, arrowhead = 1)
    fig.show()
    fig.write_html("media_heat_viz.html")
# Draw the heat curves of the ten most discussed media with release-date markers
visulize_media_heat(media_heat_pd, media_timeline_pd, topn=10)
<command-1027951213072273>:3: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
# The most popular Marvel character overall: register df_char as the SQL view
# "character" and show the single row with the highest mention_count
df_char.createOrReplaceTempView("character")
spark.sql("SELECT * FROM character order by mention_count desc limit 1").show()
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+ | name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count| +-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+ |wanda| 100| 10| 29| 70| 100| 80|Female|Mutant| bad| 186333| +-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
# The most popular female character: row with the highest mention_count among
# rows whose gender matches 'Female' (LIKE without wildcards, so an exact match)
spark.sql("SELECT * FROM character where gender like 'Female' order by mention_count desc limit 1").show()
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+ | name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count| +-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+ |wanda| 100| 10| 29| 70| 100| 80|Female|Mutant| bad| 186333| +-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
# The most popular male character: row with the highest mention_count among
# rows whose gender matches 'Male' (LIKE without wildcards, so an exact match)
spark.sql("SELECT * FROM character where gender like 'Male' order by mention_count desc limit 1").show()
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+ | name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count| +----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+ |spider-man| 90| 55| 67| 75| 74| 85| Male|Human| good| 146361| +----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
# The most popular hero: row with the highest mention_count among rows whose
# alignment matches 'good'
spark.sql("SELECT * FROM character where alignment like 'good' order by mention_count desc limit 1").show()
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+ | name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count| +----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+ |spider-man| 90| 55| 67| 75| 74| 85| Male|Human| good| 146361| +----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
# The most popular villain: row with the highest mention_count among rows whose
# alignment matches 'bad'
spark.sql("SELECT * FROM character where alignment like 'bad' order by mention_count desc limit 1").show()
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+ | name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count| +-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+ |wanda| 100| 10| 29| 70| 100| 80|Female|Mutant| bad| 186333| +-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
# The most popular human character: row with the highest mention_count among
# rows whose race matches 'Human'
spark.sql("SELECT * FROM character where race like 'Human' order by mention_count desc limit 1").show()
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+ | name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count| +----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+ |spider-man| 90| 55| 67| 75| 74| 85| Male|Human| good| 146361| +----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
# Character indicator columns are the ones whose name starts with "character"
char_ls = [name for name in sub_com_nlp.columns if name.startswith('character')]
col_ls = ['created_ts', 'score', 'sentiment'] + char_ls
# Keep only the analysis columns and derive the calendar date from created_ts
char_nlp = sub_com_nlp.select(*col_ls).withColumn('date', to_date('created_ts'))
# Turn every character indicator into an integer so it can be summed later
for col_name in char_ls:
    char_nlp = char_nlp.withColumn(col_name, col(col_name).cast('int'))
def char_heat(dat, score_weight = 0, sentiment_weight = (1, 1, 1)):
    """
    Calculate the daily discussion heat of each character; return a pd.DataFrame.

    dat: pyspark df; must include the `date`, `sentiment` and `score` columns and
         every column named in the module-level list `char_ls`
    score_weight: if != 0, every character column is multiplied by
                  score * score_weight before aggregation; default 0 leaves the
                  character columns as they are
    sentiment_weight: three weights, in the order positive, neutral, negative;
                      any sentiment that is neither 'positive' nor 'negative'
                      gets the neutral weight; (1, 1, 1) simply adds up all rows
    returns: pd.DataFrame with columns ['date', 'score'] + char_ls, one row per date
    """
    if score_weight != 0:
        for col_name in char_ls:
            # 'score' must be quoted: it is a column name, not a Python variable
            dat = dat.withColumn(col_name, col(col_name) * col('score') * score_weight)
    dat_pd = dat.groupby('date', 'sentiment').sum().toPandas()
    # Spark names aggregated columns "sum(<col>)". Recover the original names by
    # stripping that wrapper instead of relabelling by position, which would
    # mislabel the `sentiment` and `score` columns.
    dat_pd = dat_pd.rename(
        columns=lambda c: c[4:-1] if c.startswith('sum(') and c.endswith(')') else c
    )
    dat_pd = dat_pd.sort_values(by = ['date', 'sentiment'])
    # One weight per (date, sentiment) row
    row_weight = np.where(
        dat_pd['sentiment'] == 'positive',
        sentiment_weight[0],
        np.where(dat_pd['sentiment'] == 'negative', sentiment_weight[2], sentiment_weight[1]),
    )
    for char in char_ls:
        dat_pd[char] = dat_pd[char] * row_weight
    # Collapse the sentiment dimension. Only the numeric columns are summed, so
    # the string `sentiment` column never enters the reduction.
    dat_pd = dat_pd.groupby('date')[['score'] + char_ls].sum().reset_index()
    return dat_pd
# Daily heat per character with the default weights
char_nlp_pd = char_heat(char_nlp)
# Keep the date plus the character columns (the aggregated score is left out)
char_heat_pd = char_nlp_pd[['date'] + char_ls]
# Column labels without their first 10 characters, i.e. without "character:"
rename_ls = ['date'] + [name[len('character:'):] for name in char_ls]
char_heat_pd.columns = rename_ls
## visualize
def visulize_char_heat(dat, timeline, topn = 10):
    """
    Plot the daily heat of the `topn` characters with the largest total heat.

    dat: pd.DataFrame with a `date` column and one heat column per character
    timeline: pd.DataFrame with `release_date` (must expose .year/.month/.day)
              and `media_name`; one dotted vertical line is drawn per row
    topn: number of character lines to draw, default 10

    Shows the figure and writes it to "char_heat_viz.html". Returns None.
    """
    # Rank by total heat. `date` is left out of the reduction because it is not
    # numeric: summing it triggers a FutureWarning on older pandas and a
    # TypeError on newer versions.
    rank_df = dat.drop(columns = ['date']).sum().sort_values(ascending = False)
    # Honour `topn` instead of always slicing the first 10
    char_chose = list(rank_df[:topn].index)
    dat_viz = dat[['date']+char_chose]
    # Long format: one row per (date, character) pair
    dat_viz = pd.melt(dat_viz, id_vars = ['date'], value_vars = char_chose)
    dat_viz.columns = ['date', 'character', 'heat']
    # NOTE(review): "Marval" in the title looks like a typo for "Marvel" -- left as is
    fig = px.line(dat_viz, x = 'date', y = 'heat', color = 'character', width = 1100, height = 600, title = 'Discussion Heat of Marval Characters on Reddit')
    fig.update_layout(plot_bgcolor = '#FCFBF8', margin = dict(t = 70, l = 20, b = 20, r = 20), font_family = 'Roboto Slab', xaxis_title = 'Date',
                      yaxis_title = 'Heat', title = {'font': {'size':24}})
    for index, row in timeline.iterrows():
        dt = row['release_date']
        # Release date expressed as milliseconds since 1970-01-01
        pos = int((datetime(dt.year, dt.month, dt.day) - datetime(1970,1,1)).total_seconds()) * 1000
        fig.add_vline(x = pos, line_width = 1, line_dash = 'dot', opacity = 0.7)
        # Random height between 2000 and 4000 for each release label
        fig.add_annotation(x = pos+1, y = rd.randint(2000, 4000), text = row['media_name'], showarrow = True, arrowhead = 1)
    fig.show()
    fig.write_html("char_heat_viz.html")
# Draw the character heat curves with release-date markers
visulize_char_heat(char_heat_pd, media_timeline_pd, topn=10)
<command-1027951213072224>:4: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
import numpy as np
# Explicit alias for the Spark SQL functions: this notebook rebinds the
# star-imported name `count` to an int, so the bare name cannot be relied on
from pyspark.sql import functions as F

character_columns = [c for c in sub_com_nlp.columns if c.startswith('character:')]
num_character = len(character_columns)
# Symmetric matrices indexed by position in character_columns. Cell (i, j)
# describes the rows mentioning both characters; the diagonal describes the
# rows mentioning the single character.
char_cooccur_count = np.zeros((num_character, num_character))
char_cooccur_positve_perc = np.zeros((num_character, num_character))
char_cooccur_score = np.zeros((num_character, num_character))
# Number of (i, j) pairs with j >= i, used only for the progress display
num_steps_required = num_character * (num_character + 1) // 2
acc = 0
for i in range(num_character):
    for j in range(i, num_character):
        char1 = character_columns[i]
        char2 = character_columns[j]
        if i == j:
            df_temp = sub_com_nlp.filter(F.col(char1))
        else:
            df_temp = sub_com_nlp.filter(F.col(char1) & F.col(char2))
        acc += 1
        print(char1, char2, str(acc)+'/'+str(num_steps_required)+' '*20, end='\r', sep=' | ')
        # One Spark job per pair instead of three: row count, number of
        # positive rows and mean score are computed in a single aggregation
        pair_stats = df_temp.agg(
            F.count(F.lit(1)).alias('n_rows'),
            F.sum(F.when(F.col('sentiment') == 'positive', 1).otherwise(0)).alias('n_positive'),
            F.avg('score').alias('score_avg'),
        ).collect()[0]
        pair_count = pair_stats['n_rows']
        if pair_count == 0:
            # The two characters never appear together
            positive_perc = 0
            score_avg = 0
        else:
            positive_perc = pair_stats['n_positive'] / pair_count
            score_avg = pair_stats['score_avg']
            if score_avg is None:
                # avg() is NULL when every score in the group is NULL
                score_avg = np.nan
        # Fill both halves so the matrices stay symmetric
        char_cooccur_count[i, j] = char_cooccur_count[j, i] = pair_count
        char_cooccur_positve_perc[i, j] = char_cooccur_positve_perc[j, i] = positive_perc
        char_cooccur_score[i, j] = char_cooccur_score[j, i] = score_avg
character:black widow | character:black widow | 1/1275 character:black widow | character:yelena belova | 2/1275 character:black widow | character:shang-chi | 3/1275 character:black widow | character:sersi | 4/1275 character:black widow | character:ikaris | 5/1275 character:black widow | character:thena | 6/1275 character:black widow | character:ajak | 7/1275 character:black widow | character:spider-man | 8/1275 character:black widow | character:doctor strange | 9/1275 character:black widow | character:electro | 10/1275 character:black widow | character:green goblin | 11/1275 character:black widow | character:doc ock | 12/1275 character:black widow | character:wong | 13/1275 character:black widow | character:wanda | 14/1275 character:black widow | character:thor | 15/1275 character:black widow | character:jane foster | 16/1275 character:black widow | character:gorr | 17/1275 character:black widow | character:vision | 18/1275 character:black widow | character:agnes | 19/1275 character:black widow | character:falcon | 20/1275 character:black widow | character:bucky | 21/1275 character:black widow | character:john walker | 22/1275 character:black widow | character:captain america | 23/1275 character:black widow | character:loki | 24/1275 character:black widow | character:casey | 25/1275 character:black widow | character:the watcher | 26/1275 character:black widow | character:nick fury | 27/1275 character:black widow | character:iron man | 28/1275 character:black widow | character:hawkeye | 29/1275 character:black widow | character:ultron | 30/1275 character:black widow | character:red skull | 31/1275 character:black widow | character:captain marvel | 32/1275 character:black widow | character:captain carter | 33/1275 character:black widow | character:hulk | 34/1275 character:black widow | character:nebula | 35/1275 character:black widow | character:hank pym | 36/1275 character:black widow | character:ant-man | 37/1275 character:black widow | character:thanos | 38/1275 
character:black widow | character:kate bishop | 39/1275 character:black widow | character:kingpin | 40/1275 character:black widow | character:moon knight | 41/1275 character:black widow | character:arthur harrow | 42/1275 character:black widow | character:kamala khan | 43/1275 character:black widow | character:she-hulk | 44/1275 character:black widow | character:abomination | 45/1275 character:black widow | character:odin | 46/1275 character:black widow | character:pepper potts | 47/1275 character:black widow | character:mj | 48/1275 character:black widow | character:ned | 49/1275 character:black widow | character:happy | 50/1275 character:yelena belova | character:yelena belova | 51/1275 character:yelena belova | character:shang-chi | 52/1275 character:yelena belova | character:sersi | 53/1275 character:yelena belova | character:ikaris | 54/1275 character:yelena belova | character:thena | 55/1275 character:yelena belova | character:ajak | 56/1275 character:yelena belova | character:spider-man | 57/1275 character:yelena belova | character:doctor strange | 58/1275 character:yelena belova | character:electro | 59/1275 character:yelena belova | character:green goblin | 60/1275 character:yelena belova | character:doc ock | 61/1275 character:yelena belova | character:wong | 62/1275 character:yelena belova | character:wanda | 63/1275 character:yelena belova | character:thor | 64/1275 character:yelena belova | character:jane foster | 65/1275 character:yelena belova | character:gorr | 66/1275 character:yelena belova | character:vision | 67/1275 character:yelena belova | character:agnes | 68/1275 character:yelena belova | character:falcon | 69/1275 character:yelena belova | character:bucky | 70/1275 character:yelena belova | character:john walker | 71/1275 character:yelena belova | character:captain america | 72/1275 character:yelena belova | character:loki | 73/1275 character:yelena belova | character:casey | 74/1275 character:yelena belova | character:the watcher | 
75/1275 character:yelena belova | character:nick fury | 76/1275 character:yelena belova | character:iron man | 77/1275 character:yelena belova | character:hawkeye | 78/1275 character:yelena belova | character:ultron | 79/1275 character:yelena belova | character:red skull | 80/1275 character:yelena belova | character:captain marvel | 81/1275 character:yelena belova | character:captain carter | 82/1275 character:yelena belova | character:hulk | 83/1275 character:yelena belova | character:nebula | 84/1275 character:yelena belova | character:hank pym | 85/1275 character:yelena belova | character:ant-man | 86/1275 character:yelena belova | character:thanos | 87/1275 character:yelena belova | character:kate bishop | 88/1275 character:yelena belova | character:kingpin | 89/1275 character:yelena belova | character:moon knight | 90/1275 character:yelena belova | character:arthur harrow | 91/1275 character:yelena belova | character:kamala khan | 92/1275 character:yelena belova | character:she-hulk | 93/1275 character:yelena belova | character:abomination | 94/1275 character:yelena belova | character:odin | 95/1275 character:yelena belova | character:pepper potts | 96/1275 character:yelena belova | character:mj | 97/1275 character:yelena belova | character:ned | 98/1275 character:yelena belova | character:happy | 99/1275 character:shang-chi | character:shang-chi | 100/1275 character:shang-chi | character:sersi | 101/1275 character:shang-chi | character:ikaris | 102/1275 character:shang-chi | character:thena | 103/1275 character:shang-chi | character:ajak | 104/1275 character:shang-chi | character:spider-man | 105/1275 character:shang-chi | character:doctor strange | 106/1275 character:shang-chi | character:electro | 107/1275 character:shang-chi | character:green goblin | 108/1275 character:shang-chi | character:doc ock | 109/1275 character:shang-chi | character:wong | 110/1275 character:shang-chi | character:wanda | 111/1275 character:shang-chi | character:thor | 112/1275 
character:shang-chi | character:jane foster | 113/1275 character:shang-chi | character:gorr | 114/1275 character:shang-chi | character:vision | 115/1275 character:shang-chi | character:agnes | 116/1275 character:shang-chi | character:falcon | 117/1275 character:shang-chi | character:bucky | 118/1275 character:shang-chi | character:john walker | 119/1275 character:shang-chi | character:captain america | 120/1275 character:shang-chi | character:loki | 121/1275 character:shang-chi | character:casey | 122/1275 character:shang-chi | character:the watcher | 123/1275 character:shang-chi | character:nick fury | 124/1275 character:shang-chi | character:iron man | 125/1275 character:shang-chi | character:hawkeye | 126/1275 character:shang-chi | character:ultron | 127/1275 character:shang-chi | character:red skull | 128/1275 character:shang-chi | character:captain marvel | 129/1275 character:shang-chi | character:captain carter | 130/1275 character:shang-chi | character:hulk | 131/1275 character:shang-chi | character:nebula | 132/1275 character:shang-chi | character:hank pym | 133/1275 character:shang-chi | character:ant-man | 134/1275 character:shang-chi | character:thanos | 135/1275 character:shang-chi | character:kate bishop | 136/1275 character:shang-chi | character:kingpin | 137/1275 character:shang-chi | character:moon knight | 138/1275 character:shang-chi | character:arthur harrow | 139/1275 character:shang-chi | character:kamala khan | 140/1275 character:shang-chi | character:she-hulk | 141/1275 character:shang-chi | character:abomination | 142/1275 character:shang-chi | character:odin | 143/1275 character:shang-chi | character:pepper potts | 144/1275 character:shang-chi | character:mj | 145/1275 character:shang-chi | character:ned | 146/1275 character:shang-chi | character:happy | 147/1275 character:sersi | character:sersi | 148/1275 character:sersi | character:ikaris | 149/1275 character:sersi | character:thena | 150/1275 character:sersi | character:ajak | 151/1275 
character:sersi | character:spider-man | 152/1275 character:sersi | character:doctor strange | 153/1275 character:sersi | character:electro | 154/1275 character:sersi | character:green goblin | 155/1275 character:sersi | character:doc ock | 156/1275 character:sersi | character:wong | 157/1275 character:sersi | character:wanda | 158/1275 character:sersi | character:thor | 159/1275 character:sersi | character:jane foster | 160/1275 character:sersi | character:gorr | 161/1275 character:sersi | character:vision | 162/1275 character:sersi | character:agnes | 163/1275 character:sersi | character:falcon | 164/1275 character:sersi | character:bucky | 165/1275 character:sersi | character:john walker | 166/1275 character:sersi | character:captain america | 167/1275 character:sersi | character:loki | 168/1275 character:sersi | character:casey | 169/1275 character:sersi | character:the watcher | 170/1275 character:sersi | character:nick fury | 171/1275 character:sersi | character:iron man | 172/1275 character:sersi | character:hawkeye | 173/1275 character:sersi | character:ultron | 174/1275 character:sersi | character:red skull | 175/1275 character:sersi | character:captain marvel | 176/1275 character:sersi | character:captain carter | 177/1275 character:sersi | character:hulk | 178/1275 character:sersi | character:nebula | 179/1275 character:sersi | character:hank pym | 180/1275 character:sersi | character:ant-man | 181/1275 character:sersi | character:thanos | 182/1275 character:sersi | character:kate bishop | 183/1275 character:sersi | character:kingpin | 184/1275 character:sersi | character:moon knight | 185/1275 character:sersi | character:arthur harrow | 186/1275 character:sersi | character:kamala khan | 187/1275 character:sersi | character:she-hulk | 188/1275 character:sersi | character:abomination | 189/1275 character:sersi | character:odin | 190/1275 character:sersi | character:pepper potts | 191/1275 character:sersi | character:mj | 192/1275 character:sersi | 
character:ned | 193/1275 character:sersi | character:happy | 194/1275 character:ikaris | character:ikaris | 195/1275 character:ikaris | character:thena | 196/1275 character:ikaris | character:ajak | 197/1275 character:ikaris | character:spider-man | 198/1275 character:ikaris | character:doctor strange | 199/1275 character:ikaris | character:electro | 200/1275 character:ikaris | character:green goblin | 201/1275 character:ikaris | character:doc ock | 202/1275 character:ikaris | character:wong | 203/1275 character:ikaris | character:wanda | 204/1275 character:ikaris | character:thor | 205/1275 character:ikaris | character:jane foster | 206/1275 character:ikaris | character:gorr | 207/1275 character:ikaris | character:vision | 208/1275 character:ikaris | character:agnes | 209/1275 character:ikaris | character:falcon | 210/1275 character:ikaris | character:bucky | 211/1275 character:ikaris | character:john walker | 212/1275 character:ikaris | character:captain america | 213/1275 character:ikaris | character:loki | 214/1275 character:ikaris | character:casey | 215/1275 character:ikaris | character:the watcher | 216/1275 character:ikaris | character:nick fury | 217/1275 character:ikaris | character:iron man | 218/1275 character:ikaris | character:hawkeye | 219/1275 character:ikaris | character:ultron | 220/1275 character:ikaris | character:red skull | 221/1275 character:ikaris | character:captain marvel | 222/1275 character:ikaris | character:captain carter | 223/1275 character:ikaris | character:hulk | 224/1275 character:ikaris | character:nebula | 225/1275 character:ikaris | character:hank pym | 226/1275 character:ikaris | character:ant-man | 227/1275 character:ikaris | character:thanos | 228/1275 character:ikaris | character:kate bishop | 229/1275 character:ikaris | character:kingpin | 230/1275 character:ikaris | character:moon knight | 231/1275 character:ikaris | character:arthur harrow | 232/1275 character:ikaris | character:kamala khan | 233/1275 character:ikaris | 
character:she-hulk | 234/1275 character:ikaris | character:abomination | 235/1275 character:ikaris | character:odin | 236/1275 character:ikaris | character:pepper potts | 237/1275 character:ikaris | character:mj | 238/1275 character:ikaris | character:ned | 239/1275 character:ikaris | character:happy | 240/1275 character:thena | character:thena | 241/1275 character:thena | character:ajak | 242/1275 character:thena | character:spider-man | 243/1275 character:thena | character:doctor strange | 244/1275 character:thena | character:electro | 245/1275 character:thena | character:green goblin | 246/1275 character:thena | character:doc ock | 247/1275 character:thena | character:wong | 248/1275 character:thena | character:wanda | 249/1275 character:thena | character:thor | 250/1275 character:thena | character:jane foster | 251/1275 character:thena | character:gorr | 252/1275 character:thena | character:vision | 253/1275 character:thena | character:agnes | 254/1275 character:thena | character:falcon | 255/1275 character:thena | character:bucky | 256/1275 character:thena | character:john walker | 257/1275 character:thena | character:captain america | 258/1275 character:thena | character:loki | 259/1275 character:thena | character:casey | 260/1275 character:thena | character:the watcher | 261/1275 character:thena | character:nick fury | 262/1275 character:thena | character:iron man | 263/1275 character:thena | character:hawkeye | 264/1275 character:thena | character:ultron | 265/1275 character:thena | character:red skull | 266/1275 character:thena | character:captain marvel | 267/1275 character:thena | character:captain carter | 268/1275 character:thena | character:hulk | 269/1275 character:thena | character:nebula | 270/1275 character:thena | character:hank pym | 271/1275 character:thena | character:ant-man | 272/1275 character:thena | character:thanos | 273/1275 character:thena | character:kate bishop | 274/1275 character:thena | character:kingpin | 275/1275 character:thena 
| character:moon knight | 276/1275 character:thena | character:arthur harrow | 277/1275 character:thena | character:kamala khan | 278/1275 character:thena | character:she-hulk | 279/1275 character:thena | character:abomination | 280/1275 character:thena | character:odin | 281/1275 character:thena | character:pepper potts | 282/1275 character:thena | character:mj | 283/1275 character:thena | character:ned | 284/1275 character:thena | character:happy | 285/1275 character:ajak | character:ajak | 286/1275 character:ajak | character:spider-man | 287/1275 character:ajak | character:doctor strange | 288/1275 character:ajak | character:electro | 289/1275 character:ajak | character:green goblin | 290/1275 character:ajak | character:doc ock | 291/1275 character:ajak | character:wong | 292/1275 character:ajak | character:wanda | 293/1275 character:ajak | character:thor | 294/1275 character:ajak | character:jane foster | 295/1275 character:ajak | character:gorr | 296/1275 character:ajak | character:vision | 297/1275 character:ajak | character:agnes | 298/1275 character:ajak | character:falcon | 299/1275 character:ajak | character:bucky | 300/1275 character:ajak | character:john walker | 301/1275 character:ajak | character:captain america | 302/1275 character:ajak | character:loki | 303/1275 character:ajak | character:casey | 304/1275 character:ajak | character:the watcher | 305/1275 character:ajak | character:nick fury | 306/1275 character:ajak | character:iron man | 307/1275 character:ajak | character:hawkeye | 308/1275 character:ajak | character:ultron | 309/1275 character:ajak | character:red skull | 310/1275 character:ajak | character:captain marvel | 311/1275 character:ajak | character:captain carter | 312/1275 character:ajak | character:hulk | 313/1275 character:ajak | character:nebula | 314/1275 character:ajak | character:hank pym | 315/1275 character:ajak | character:ant-man | 316/1275 character:ajak | character:thanos | 317/1275 character:ajak | character:kate bishop 
| 318/1275 character:ajak | character:kingpin | 319/1275 character:ajak | character:moon knight | 320/1275 character:ajak | character:arthur harrow | 321/1275 character:ajak | character:kamala khan | 322/1275 character:ajak | character:she-hulk | 323/1275 character:ajak | character:abomination | 324/1275 character:ajak | character:odin | 325/1275 character:ajak | character:pepper potts | 326/1275 character:ajak | character:mj | 327/1275 character:ajak | character:ned | 328/1275 character:ajak | character:happy | 329/1275 character:spider-man | character:spider-man | 330/1275 character:spider-man | character:doctor strange | 331/1275 character:spider-man | character:electro | 332/1275 character:spider-man | character:green goblin | 333/1275 character:spider-man | character:doc ock | 334/1275 character:spider-man | character:wong | 335/1275 character:spider-man | character:wanda | 336/1275 character:spider-man | character:thor | 337/1275 character:spider-man | character:jane foster | 338/1275 character:spider-man | character:gorr | 339/1275 character:spider-man | character:vision | 340/1275 character:spider-man | character:agnes | 341/1275 character:spider-man | character:falcon | 342/1275 character:spider-man | character:bucky | 343/1275 character:spider-man | character:john walker | 344/1275 character:spider-man | character:captain america | 345/1275 character:spider-man | character:loki | 346/1275 character:spider-man | character:casey | 347/1275 character:spider-man | character:the watcher | 348/1275 character:spider-man | character:nick fury | 349/1275 character:spider-man | character:iron man | 350/1275 character:spider-man | character:hawkeye | 351/1275 character:spider-man | character:ultron | 352/1275 character:spider-man | character:red skull | 353/1275 character:spider-man | character:captain marvel | 354/1275 character:spider-man | character:captain carter | 355/1275 character:spi *** WARNING: max output size exceeded, skipping output. 
*** eye | 929/1275 character:casey | character:ultron | 930/1275 character:casey | character:red skull | 931/1275 character:casey | character:captain marvel | 932/1275 character:casey | character:captain carter | 933/1275 character:casey | character:hulk | 934/1275 character:casey | character:nebula | 935/1275 character:casey | character:hank pym | 936/1275 character:casey | character:ant-man | 937/1275 character:casey | character:thanos | 938/1275 character:casey | character:kate bishop | 939/1275 character:casey | character:kingpin | 940/1275 character:casey | character:moon knight | 941/1275 character:casey | character:arthur harrow | 942/1275 character:casey | character:kamala khan | 943/1275 character:casey | character:she-hulk | 944/1275 character:casey | character:abomination | 945/1275 character:casey | character:odin | 946/1275 character:casey | character:pepper potts | 947/1275 character:casey | character:mj | 948/1275 character:casey | character:ned | 949/1275 character:casey | character:happy | 950/1275 character:the watcher | character:the watcher | 951/1275 character:the watcher | character:nick fury | 952/1275 character:the watcher | character:iron man | 953/1275 character:the watcher | character:hawkeye | 954/1275 character:the watcher | character:ultron | 955/1275 character:the watcher | character:red skull | 956/1275 character:the watcher | character:captain marvel | 957/1275 character:the watcher | character:captain carter | 958/1275 character:the watcher | character:hulk | 959/1275 character:the watcher | character:nebula | 960/1275 character:the watcher | character:hank pym | 961/1275 character:the watcher | character:ant-man | 962/1275 character:the watcher | character:thanos | 963/1275 character:the watcher | character:kate bishop | 964/1275 character:the watcher | character:kingpin | 965/1275 character:the watcher | character:moon knight | 966/1275 character:the watcher | character:arthur harrow | 967/1275 character:the watcher | 
character:kamala khan | 968/1275 character:the watcher | character:she-hulk | 969/1275 character:the watcher | character:abomination | 970/1275 character:the watcher | character:odin | 971/1275 character:the watcher | character:pepper potts | 972/1275 character:the watcher | character:mj | 973/1275 character:the watcher | character:ned | 974/1275 character:the watcher | character:happy | 975/1275 character:nick fury | character:nick fury | 976/1275 character:nick fury | character:iron man | 977/1275 character:nick fury | character:hawkeye | 978/1275 character:nick fury | character:ultron | 979/1275 character:nick fury | character:red skull | 980/1275 character:nick fury | character:captain marvel | 981/1275 character:nick fury | character:captain carter | 982/1275 character:nick fury | character:hulk | 983/1275 character:nick fury | character:nebula | 984/1275 character:nick fury | character:hank pym | 985/1275 character:nick fury | character:ant-man | 986/1275 character:nick fury | character:thanos | 987/1275 character:nick fury | character:kate bishop | 988/1275 character:nick fury | character:kingpin | 989/1275 character:nick fury | character:moon knight | 990/1275 character:nick fury | character:arthur harrow | 991/1275 character:nick fury | character:kamala khan | 992/1275 character:nick fury | character:she-hulk | 993/1275 character:nick fury | character:abomination | 994/1275 character:nick fury | character:odin | 995/1275 character:nick fury | character:pepper potts | 996/1275 character:nick fury | character:mj | 997/1275 character:nick fury | character:ned | 998/1275 character:nick fury | character:happy | 999/1275 character:iron man | character:iron man | 1000/1275 character:iron man | character:hawkeye | 1001/1275 character:iron man | character:ultron | 1002/1275 character:iron man | character:red skull | 1003/1275 character:iron man | character:captain marvel | 1004/1275 character:iron man | character:captain carter | 1005/1275 character:iron man | 
character:hulk | 1006/1275 character:iron man | character:nebula | 1007/1275 character:iron man | character:hank pym | 1008/1275 character:iron man | character:ant-man | 1009/1275 character:iron man | character:thanos | 1010/1275 character:iron man | character:kate bishop | 1011/1275 character:iron man | character:kingpin | 1012/1275 character:iron man | character:moon knight | 1013/1275 character:iron man | character:arthur harrow | 1014/1275 character:iron man | character:kamala khan | 1015/1275 character:iron man | character:she-hulk | 1016/1275 character:iron man | character:abomination | 1017/1275 character:iron man | character:odin | 1018/1275 character:iron man | character:pepper potts | 1019/1275 character:iron man | character:mj | 1020/1275 character:iron man | character:ned | 1021/1275 character:iron man | character:happy | 1022/1275 character:hawkeye | character:hawkeye | 1023/1275 character:hawkeye | character:ultron | 1024/1275 character:hawkeye | character:red skull | 1025/1275 character:hawkeye | character:captain marvel | 1026/1275 character:hawkeye | character:captain carter | 1027/1275 character:hawkeye | character:hulk | 1028/1275 character:hawkeye | character:nebula | 1029/1275 character:hawkeye | character:hank pym | 1030/1275 character:hawkeye | character:ant-man | 1031/1275 character:hawkeye | character:thanos | 1032/1275 character:hawkeye | character:kate bishop | 1033/1275 character:hawkeye | character:kingpin | 1034/1275 character:hawkeye | character:moon knight | 1035/1275 character:hawkeye | character:arthur harrow | 1036/1275 character:hawkeye | character:kamala khan | 1037/1275 character:hawkeye | character:she-hulk | 1038/1275 character:hawkeye | character:abomination | 1039/1275 character:hawkeye | character:odin | 1040/1275 character:hawkeye | character:pepper potts | 1041/1275 character:hawkeye | character:mj | 1042/1275 character:hawkeye | character:ned | 1043/1275 character:hawkeye | character:happy | 1044/1275 character:ultron | 
character:ultron | 1045/1275 character:ultron | character:red skull | 1046/1275 character:ultron | character:captain marvel | 1047/1275 character:ultron | character:captain carter | 1048/1275 character:ultron | character:hulk | 1049/1275 character:ultron | character:nebula | 1050/1275 character:ultron | character:hank pym | 1051/1275 character:ultron | character:ant-man | 1052/1275 character:ultron | character:thanos | 1053/1275 character:ultron | character:kate bishop | 1054/1275 character:ultron | character:kingpin | 1055/1275 character:ultron | character:moon knight | 1056/1275 character:ultron | character:arthur harrow | 1057/1275 character:ultron | character:kamala khan | 1058/1275 character:ultron | character:she-hulk | 1059/1275 character:ultron | character:abomination | 1060/1275 character:ultron | character:odin | 1061/1275 character:ultron | character:pepper potts | 1062/1275 character:ultron | character:mj | 1063/1275 character:ultron | character:ned | 1064/1275 character:ultron | character:happy | 1065/1275 character:red skull | character:red skull | 1066/1275 character:red skull | character:captain marvel | 1067/1275 character:red skull | character:captain carter | 1068/1275 character:red skull | character:hulk | 1069/1275 character:red skull | character:nebula | 1070/1275 character:red skull | character:hank pym | 1071/1275 character:red skull | character:ant-man | 1072/1275 character:red skull | character:thanos | 1073/1275 character:red skull | character:kate bishop | 1074/1275 character:red skull | character:kingpin | 1075/1275 character:red skull | character:moon knight | 1076/1275 character:red skull | character:arthur harrow | 1077/1275 character:red skull | character:kamala khan | 1078/1275 character:red skull | character:she-hulk | 1079/1275 character:red skull | character:abomination | 1080/1275 character:red skull | character:odin | 1081/1275 character:red skull | character:pepper potts | 1082/1275 character:red skull | character:mj | 
1083/1275 character:red skull | character:ned | 1084/1275 character:red skull | character:happy | 1085/1275 character:captain marvel | character:captain marvel | 1086/1275 character:captain marvel | character:captain carter | 1087/1275 character:captain marvel | character:hulk | 1088/1275 character:captain marvel | character:nebula | 1089/1275 character:captain marvel | character:hank pym | 1090/1275 character:captain marvel | character:ant-man | 1091/1275 character:captain marvel | character:thanos | 1092/1275 character:captain marvel | character:kate bishop | 1093/1275 character:captain marvel | character:kingpin | 1094/1275 character:captain marvel | character:moon knight | 1095/1275 character:captain marvel | character:arthur harrow | 1096/1275 character:captain marvel | character:kamala khan | 1097/1275 character:captain marvel | character:she-hulk | 1098/1275 character:captain marvel | character:abomination | 1099/1275 character:captain marvel | character:odin | 1100/1275 character:captain marvel | character:pepper potts | 1101/1275 character:captain marvel | character:mj | 1102/1275 character:captain marvel | character:ned | 1103/1275 character:captain marvel | character:happy | 1104/1275 character:captain carter | character:captain carter | 1105/1275 character:captain carter | character:hulk | 1106/1275 character:captain carter | character:nebula | 1107/1275 character:captain carter | character:hank pym | 1108/1275 character:captain carter | character:ant-man | 1109/1275 character:captain carter | character:thanos | 1110/1275 character:captain carter | character:kate bishop | 1111/1275 character:captain carter | character:kingpin | 1112/1275 character:captain carter | character:moon knight | 1113/1275 character:captain carter | character:arthur harrow | 1114/1275 character:captain carter | character:kamala khan | 1115/1275 character:captain carter | character:she-hulk | 1116/1275 character:captain carter | character:abomination | 1117/1275 character:captain 
carter | character:odin | 1118/1275 character:captain carter | character:pepper potts | 1119/1275 character:captain carter | character:mj | 1120/1275 character:captain carter | character:ned | 1121/1275 character:captain carter | character:happy | 1122/1275 character:hulk | character:hulk | 1123/1275 character:hulk | character:nebula | 1124/1275 character:hulk | character:hank pym | 1125/1275 character:hulk | character:ant-man | 1126/1275 character:hulk | character:thanos | 1127/1275 character:hulk | character:kate bishop | 1128/1275 character:hulk | character:kingpin | 1129/1275 character:hulk | character:moon knight | 1130/1275 character:hulk | character:arthur harrow | 1131/1275 character:hulk | character:kamala khan | 1132/1275 character:hulk | character:she-hulk | 1133/1275 character:hulk | character:abomination | 1134/1275 character:hulk | character:odin | 1135/1275 character:hulk | character:pepper potts | 1136/1275 character:hulk | character:mj | 1137/1275 character:hulk | character:ned | 1138/1275 character:hulk | character:happy | 1139/1275 character:nebula | character:nebula | 1140/1275 character:nebula | character:hank pym | 1141/1275 character:nebula | character:ant-man | 1142/1275 character:nebula | character:thanos | 1143/1275 character:nebula | character:kate bishop | 1144/1275 character:nebula | character:kingpin | 1145/1275 character:nebula | character:moon knight | 1146/1275 character:nebula | character:arthur harrow | 1147/1275 character:nebula | character:kamala khan | 1148/1275 character:nebula | character:she-hulk | 1149/1275 character:nebula | character:abomination | 1150/1275 character:nebula | character:odin | 1151/1275 character:nebula | character:pepper potts | 1152/1275 character:nebula | character:mj | 1153/1275 character:nebula | character:ned | 1154/1275 character:nebula | character:happy | 1155/1275 character:hank pym | character:hank pym | 1156/1275 character:hank pym | character:ant-man | 1157/1275 character:hank pym | 
character:thanos | 1158/1275 character:hank pym | character:kate bishop | 1159/1275 character:hank pym | character:kingpin | 1160/1275 character:hank pym | character:moon knight | 1161/1275 character:hank pym | character:arthur harrow | 1162/1275 character:hank pym | character:kamala khan | 1163/1275 character:hank pym | character:she-hulk | 1164/1275 character:hank pym | character:abomination | 1165/1275 character:hank pym | character:odin | 1166/1275 character:hank pym | character:pepper potts | 1167/1275 character:hank pym | character:mj | 1168/1275 character:hank pym | character:ned | 1169/1275 character:hank pym | character:happy | 1170/1275 character:ant-man | character:ant-man | 1171/1275 character:ant-man | character:thanos | 1172/1275 character:ant-man | character:kate bishop | 1173/1275 character:ant-man | character:kingpin | 1174/1275 character:ant-man | character:moon knight | 1175/1275 character:ant-man | character:arthur harrow | 1176/1275 character:ant-man | character:kamala khan | 1177/1275 character:ant-man | character:she-hulk | 1178/1275 character:ant-man | character:abomination | 1179/1275 character:ant-man | character:odin | 1180/1275 character:ant-man | character:pepper potts | 1181/1275 character:ant-man | character:mj | 1182/1275 character:ant-man | character:ned | 1183/1275 character:ant-man | character:happy | 1184/1275 character:thanos | character:thanos | 1185/1275 character:thanos | character:kate bishop | 1186/1275 character:thanos | character:kingpin | 1187/1275 character:thanos | character:moon knight | 1188/1275 character:thanos | character:arthur harrow | 1189/1275 character:thanos | character:kamala khan | 1190/1275 character:thanos | character:she-hulk | 1191/1275 character:thanos | character:abomination | 1192/1275 character:thanos | character:odin | 1193/1275 character:thanos | character:pepper potts | 1194/1275 character:thanos | character:mj | 1195/1275 character:thanos | character:ned | 1196/1275 character:thanos | 
character:happy | 1197/1275 character:kate bishop | character:kate bishop | 1198/1275 character:kate bishop | character:kingpin | 1199/1275 character:kate bishop | character:moon knight | 1200/1275 character:kate bishop | character:arthur harrow | 1201/1275 character:kate bishop | character:kamala khan | 1202/1275 character:kate bishop | character:she-hulk | 1203/1275 character:kate bishop | character:abomination | 1204/1275 character:kate bishop | character:odin | 1205/1275 character:kate bishop | character:pepper potts | 1206/1275 character:kate bishop | character:mj | 1207/1275 character:kate bishop | character:ned | 1208/1275 character:kate bishop | character:happy | 1209/1275 character:kingpin | character:kingpin | 1210/1275 character:kingpin | character:moon knight | 1211/1275 character:kingpin | character:arthur harrow | 1212/1275 character:kingpin | character:kamala khan | 1213/1275 character:kingpin | character:she-hulk | 1214/1275 character:kingpin | character:abomination | 1215/1275 character:kingpin | character:odin | 1216/1275 character:kingpin | character:pepper potts | 1217/1275 character:kingpin | character:mj | 1218/1275 character:kingpin | character:ned | 1219/1275 character:kingpin | character:happy | 1220/1275 character:moon knight | character:moon knight | 1221/1275 character:moon knight | character:arthur harrow | 1222/1275 character:moon knight | character:kamala khan | 1223/1275 character:moon knight | character:she-hulk | 1224/1275 character:moon knight | character:abomination | 1225/1275 character:moon knight | character:odin | 1226/1275 character:moon knight | character:pepper potts | 1227/1275 character:moon knight | character:mj | 1228/1275 character:moon knight | character:ned | 1229/1275 character:moon knight | character:happy | 1230/1275 character:arthur harrow | character:arthur harrow | 1231/1275 character:arthur harrow | character:kamala khan | 1232/1275 character:arthur harrow | character:she-hulk | 1233/1275 character:arthur 
harrow | character:abomination | 1234/1275 character:arthur harrow | character:odin | 1235/1275 character:arthur harrow | character:pepper potts | 1236/1275 character:arthur harrow | character:mj | 1237/1275 character:arthur harrow | character:ned | 1238/1275 character:arthur harrow | character:happy | 1239/1275 character:kamala khan | character:kamala khan | 1240/1275 character:kamala khan | character:she-hulk | 1241/1275 character:kamala khan | character:abomination | 1242/1275 character:kamala khan | character:odin | 1243/1275 character:kamala khan | character:pepper potts | 1244/1275 character:kamala khan | character:mj | 1245/1275 character:kamala khan | character:ned | 1246/1275 character:kamala khan | character:happy | 1247/1275 character:she-hulk | character:she-hulk | 1248/1275 character:she-hulk | character:abomination | 1249/1275 character:she-hulk | character:odin | 1250/1275 character:she-hulk | character:pepper potts | 1251/1275 character:she-hulk | character:mj | 1252/1275 character:she-hulk | character:ned | 1253/1275 character:she-hulk | character:happy | 1254/1275 character:abomination | character:abomination | 1255/1275 character:abomination | character:odin | 1256/1275 character:abomination | character:pepper potts | 1257/1275 character:abomination | character:mj | 1258/1275 character:abomination | character:ned | 1259/1275 character:abomination | character:happy | 1260/1275 character:odin | character:odin | 1261/1275 character:odin | character:pepper potts | 1262/1275 character:odin | character:mj | 1263/1275 character:odin | character:ned | 1264/1275 character:odin | character:happy | 1265/1275 character:pepper potts | character:pepper potts | 1266/1275 character:pepper potts | character:mj | 1267/1275 character:pepper potts | character:ned | 1268/1275 character:pepper potts | character:happy | 1269/1275 character:mj | character:mj | 1270/1275 character:mj | character:ned | 1271/1275 character:mj | character:happy | 1272/1275 character:ned | 
character:ned | 1273/1275 character:ned | character:happy | 1274/1275 character:happy | character:happy | 1275/1275
# Heatmap of the percentage of positive comments for every character pair.
# Uses an explicit 8x8 inch Figure/Axes pair; grid lines are switched off so
# they do not overlay the image cells.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(char_cooccur_positve_perc)
ax.grid(False)
# character co-occurrence count
# Row-normalise the counts: entry (i, j) becomes count[i, j] / count[i, i],
# so every row is expressed relative to its own diagonal entry and the
# diagonal itself becomes 1.
# np.divide(..., where=...) skips rows whose diagonal is 0 and leaves them at
# 0.0 instead of emitting NaN/inf together with a RuntimeWarning.
_self_counts = char_cooccur_count.diagonal()[:, None]
char_cooccur_count_normalized = np.divide(
    char_cooccur_count,
    _self_counts,
    out=np.zeros(char_cooccur_count.shape, dtype=float),
    where=_self_counts != 0,
)
# NOTE(review): the heatmap below shows the RAW counts (strict lower triangle,
# diagonal excluded via k=-1), not the normalised matrix computed above --
# confirm this is intentional.
plt.figure(figsize=(8, 8))
plt.imshow(np.tril(char_cooccur_count, -1))
plt.grid(False)
# Persist the three co-occurrence matrices as comma-separated text files.
# The file names (including the "positve" spelling) are kept exactly as they
# are, because the loading step reads the same names back.
csv_root_dir = '/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/'
matrices_to_save = {
    'char_cooccur_count.csv': char_cooccur_count,
    'char_cooccur_positve_perc.csv': char_cooccur_positve_perc,
    'char_cooccur_score.csv': char_cooccur_score,
}
for csv_name, matrix in matrices_to_save.items():
    np.savetxt(csv_root_dir + csv_name, matrix, delimiter=',')
# Reload the co-occurrence matrices from their comma-separated text files.
csv_root_dir = '/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/'


def _load_matrix(csv_name):
    """Read one comma-delimited matrix stored under ``csv_root_dir``."""
    return np.loadtxt(csv_root_dir + csv_name, delimiter=',')


char_cooccur_count = _load_matrix('char_cooccur_count.csv')
char_cooccur_positve_perc = _load_matrix('char_cooccur_positve_perc.csv')
char_cooccur_score = _load_matrix('char_cooccur_score.csv')
def capitalize(char_name: str) -> str:
    """Return a display-cased version of a lower-case character name.

    Rules:
      * a two-character name with no space or hyphen is treated as initials
        and fully upper-cased (``"mj"`` -> ``"MJ"``);
      * otherwise every space-separated word, and every hyphen-separated part
        inside a word, gets its first letter upper-cased and the rest
        lower-cased (``"the watcher"`` -> ``"The Watcher"``,
        ``"spider-man"`` -> ``"Spider-Man"``).

    Names that contain both a space and a hyphen are handled on both
    separators (``"spider-man noir"`` -> ``"Spider-Man Noir"``).

    Args:
        char_name: the raw character name, e.g. the part after ``character:``
            in a column name.

    Returns:
        The capitalised name; an empty string is returned unchanged.
    """
    has_separator = ' ' in char_name or '-' in char_name
    if not has_separator and len(char_name) == 2:
        return char_name.upper()

    # Split on spaces first, then on hyphens inside each word, so both
    # separators are honoured and re-inserted exactly where they were.
    return ' '.join(
        '-'.join(part.capitalize() for part in word.split('-'))
        for word in char_name.split(' ')
    )
# Dump the positive-percentage matrix to result.txt, one "[y, x, value]," entry
# per line with the value rounded to three decimals. The first index is the
# row index counted from the bottom (num_character - row - 1).
result_lines = [
    f'[{num_character - row - 1}, {col}, {char_cooccur_positve_perc[row, col]:.3f}],\n'
    for row in range(num_character)
    for col in range(num_character)
]
result_txt = ''.join(result_lines)
with open('result.txt', 'w') as fID:
    fID.write(result_txt)
# generate network (tried but failed)
# Builds a pyvis Network with one node per character and one edge per
# unordered pair of distinct characters, weighted by the co-occurrence count.
# NOTE(review): the author marked this attempt as failed and the failure mode
# is not recorded here. The node/edge values are numpy scalars taken straight
# from the array -- TODO confirm pyvis can serialise them.
from pyvis.network import Network
# max_coocur_count = np.max(char_cooccur_count[np.triu_indices(num_character)])
net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", neighborhood_highlight=True, filter_menu=True)
# Keep only the text after the last ':' of each column name
# (e.g. 'character:thor' -> 'thor').
characters = [c.split(':')[-1] for c in character_columns]
# Node ids are the character indices; each node's value is the matching
# diagonal entry of the count matrix and its label is the capitalised name.
net.add_nodes(range(num_character), value=char_cooccur_count.diagonal(), label=[capitalize(c) for c in characters])
# Visit every pair (i, j) with j >= i and skip the diagonal, so each unordered
# pair of distinct characters gets exactly one edge.
for i in range(num_character):
for j in range(i, num_character):
if i == j:
continue
# Edge value is the pair's co-occurrence count; physics is disabled per edge.
net.add_edge(i, j, hidden=False, physics=False, value=char_cooccur_count[i, j])