In [0]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from sparknlp.annotator import *
from sparknlp.base import * 
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from datetime import datetime
import random as rd

Read Reddit Data¶

In [0]:
# Load processed intermediate data from parquet files
sub_com_with_dummy = spark.read.parquet("/FileStore/sub_com/with_dummy/")
In [0]:
sub_com_with_dummy.printSchema()
root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- content: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- is_submission: boolean (nullable = true)
 |-- character:black widow: boolean (nullable = true)
 |-- character:yelena belova: boolean (nullable = true)
 |-- character:shang-chi: boolean (nullable = true)
 |-- character:sersi: boolean (nullable = true)
 |-- character:ikaris: boolean (nullable = true)
 |-- character:thena: boolean (nullable = true)
 |-- character:ajak: boolean (nullable = true)
 |-- character:spider-man: boolean (nullable = true)
 |-- character:doctor strange: boolean (nullable = true)
 |-- character:electro: boolean (nullable = true)
 |-- character:green goblin: boolean (nullable = true)
 |-- character:doc ock: boolean (nullable = true)
 |-- character:wong: boolean (nullable = true)
 |-- character:wanda: boolean (nullable = true)
 |-- character:thor: boolean (nullable = true)
 |-- character:jane foster: boolean (nullable = true)
 |-- character:gorr: boolean (nullable = true)
 |-- character:vision: boolean (nullable = true)
 |-- character:agnes: boolean (nullable = true)
 |-- character:falcon: boolean (nullable = true)
 |-- character:bucky: boolean (nullable = true)
 |-- character:john walker: boolean (nullable = true)
 |-- character:captain america: boolean (nullable = true)
 |-- character:loki: boolean (nullable = true)
 |-- character:casey: boolean (nullable = true)
 |-- character:the watcher: boolean (nullable = true)
 |-- character:nick fury: boolean (nullable = true)
 |-- character:iron man: boolean (nullable = true)
 |-- character:hawkeye: boolean (nullable = true)
 |-- character:ultron: boolean (nullable = true)
 |-- character:red skull: boolean (nullable = true)
 |-- character:captain marvel: boolean (nullable = true)
 |-- character:captain carter: boolean (nullable = true)
 |-- character:hulk: boolean (nullable = true)
 |-- character:nebula: boolean (nullable = true)
 |-- character:hank pym: boolean (nullable = true)
 |-- character:ant-man: boolean (nullable = true)
 |-- character:thanos: boolean (nullable = true)
 |-- character:kate bishop: boolean (nullable = true)
 |-- character:kingpin: boolean (nullable = true)
 |-- character:moon knight: boolean (nullable = true)
 |-- character:arthur harrow: boolean (nullable = true)
 |-- character:kamala khan: boolean (nullable = true)
 |-- character:she-hulk: boolean (nullable = true)
 |-- character:abomination: boolean (nullable = true)
 |-- character:odin: boolean (nullable = true)
 |-- character:pepper potts: boolean (nullable = true)
 |-- character:mj: boolean (nullable = true)
 |-- character:ned: boolean (nullable = true)
 |-- character:happy: boolean (nullable = true)
 |-- movie:black widow: boolean (nullable = true)
 |-- movie:shang-chi: boolean (nullable = true)
 |-- movie:eternals: boolean (nullable = true)
 |-- movie:spider-man: boolean (nullable = true)
 |-- movie:doctor strange: boolean (nullable = true)
 |-- movie:thor: boolean (nullable = true)
 |-- series:wandavision: boolean (nullable = true)
 |-- series:the falcon and the winter soldier: boolean (nullable = true)
 |-- series:loki: boolean (nullable = true)
 |-- series:what if: boolean (nullable = true)
 |-- series:hawkeye: boolean (nullable = true)
 |-- series:moon knight: boolean (nullable = true)
 |-- series:ms marvel: boolean (nullable = true)
 |-- series:she-hulk: boolean (nullable = true)

Clean the Text Data¶

Create Text Cleanning Pipeline¶

In [0]:
# cleaning pipeline
document_assembler = DocumentAssembler() \
.setInputCol("content") \
.setOutputCol("document")

tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")

normalizer= Normalizer()\
    .setInputCols(["token"])\
    .setOutputCol("normalized")\
    .setCleanupPatterns(["[^\w\d\s]"])

stop_words = StopWordsCleaner.pretrained("stopwords_en", "en") \
.setInputCols(["normalized"]) \
.setOutputCol("clean_normalized")

lemmatizer = LemmatizerModel.pretrained() \
.setInputCols(["clean_normalized"]) \
.setOutputCol("lemma")

token_assembler = TokenAssembler()\
.setInputCols(["document", "lemma"])\
.setOutputCol("assembled")

clean_pipeline = Pipeline(stages=[document_assembler, tokenizer, normalizer, stop_words, lemmatizer, token_assembler])
stopwords_en download started this may take some time.
Approximate size to download 2.9 KB

[ | ]
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB

[ | ]
[OK!]
In [0]:
cleaned_df = clean_pipeline.fit(sub_com_with_dummy).transform(sub_com_with_dummy)
In [0]:
cleaned_df.select(col('assembled.result')).show(10, truncate=False)
+--------------------------------------------------------------------------------------------+
|result                                                                                      |
+--------------------------------------------------------------------------------------------+
|[white wolf]                                                                                |
|[dont race]                                                                                 |
|[huh lowkey salty]                                                                          |
|[decent guy unfairly treat sambucky]                                                        |
|[hey im bromo humor change tone episode episode general feel goofy]                         |
|[battlestar join bucky sam stop walker deep end care friend what]                           |
|[racist micro aggression place steve arrogance think mantel sam publicly give shield museum]|
|[agatha nosy neighbor 350]                                                                  |
|[story make knot hardcore fan half think]                                                   |
|[nice 5he character breath experience wouldnt 2 hr movie]                                   |
+--------------------------------------------------------------------------------------------+
only showing top 10 rows

Build Sentiment Model¶

In [0]:
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["assembled"])\
 .setOutputCol("sentence_embeddings")

sentimentdl = SentimentDLModel.pretrained(name="sentimentdl_use_twitter", lang="en")\
.setInputCols(["sentence_embeddings"])\
.setOutputCol("sentiment")

sentiment_pipeline = Pipeline(stages=[use, sentimentdl])
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB

[ | ]
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB

[ | ]
[OK!]
In [0]:
sentiment_df = sentiment_pipeline.fit(cleaned_df).transform(cleaned_df)
In [0]:
sentiment_df.select(col('assembled.result').getItem(0).alias("content"), col("sentiment.result").getItem(0).alias("sentiment")).show()
+--------------------+---------+
|             content|sentiment|
+--------------------+---------+
|          white wolf| positive|
|           dont race| negative|
|    huh lowkey salty| negative|
|decent guy unfair...|  neutral|
|hey im bromo humo...| positive|
|battlestar join b...| negative|
|racist micro aggr...| positive|
|agatha nosy neigh...| negative|
|story make knot h...| positive|
|nice 5he characte...| positive|
|abouthttpsyoutube...| positive|
|feel walker cap o...| positive|
|                wait| positive|
|bucky fall helica...| negative|
|love shit talk sa...| positive|
|      username check| positive|
|yea kind assume s...| positive|
|uh sense leader t...| negative|
|              ao mcu| positive|
|episode make laug...| positive|
+--------------------+---------+
only showing top 20 rows

In [0]:
sentiment_df.printSchema()
root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- content: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- is_submission: boolean (nullable = true)
 |-- character:black widow: boolean (nullable = true)
 |-- character:yelena belova: boolean (nullable = true)
 |-- character:shang-chi: boolean (nullable = true)
 |-- character:sersi: boolean (nullable = true)
 |-- character:ikaris: boolean (nullable = true)
 |-- character:thena: boolean (nullable = true)
 |-- character:ajak: boolean (nullable = true)
 |-- character:spider-man: boolean (nullable = true)
 |-- character:doctor strange: boolean (nullable = true)
 |-- character:electro: boolean (nullable = true)
 |-- character:green goblin: boolean (nullable = true)
 |-- character:doc ock: boolean (nullable = true)
 |-- character:wong: boolean (nullable = true)
 |-- character:wanda: boolean (nullable = true)
 |-- character:thor: boolean (nullable = true)
 |-- character:jane foster: boolean (nullable = true)
 |-- character:gorr: boolean (nullable = true)
 |-- character:vision: boolean (nullable = true)
 |-- character:agnes: boolean (nullable = true)
 |-- character:falcon: boolean (nullable = true)
 |-- character:bucky: boolean (nullable = true)
 |-- character:john walker: boolean (nullable = true)
 |-- character:captain america: boolean (nullable = true)
 |-- character:loki: boolean (nullable = true)
 |-- character:casey: boolean (nullable = true)
 |-- character:the watcher: boolean (nullable = true)
 |-- character:nick fury: boolean (nullable = true)
 |-- character:iron man: boolean (nullable = true)
 |-- character:hawkeye: boolean (nullable = true)
 |-- character:ultron: boolean (nullable = true)
 |-- character:red skull: boolean (nullable = true)
 |-- character:captain marvel: boolean (nullable = true)
 |-- character:captain carter: boolean (nullable = true)
 |-- character:hulk: boolean (nullable = true)
 |-- character:nebula: boolean (nullable = true)
 |-- character:hank pym: boolean (nullable = true)
 |-- character:ant-man: boolean (nullable = true)
 |-- character:thanos: boolean (nullable = true)
 |-- character:kate bishop: boolean (nullable = true)
 |-- character:kingpin: boolean (nullable = true)
 |-- character:moon knight: boolean (nullable = true)
 |-- character:arthur harrow: boolean (nullable = true)
 |-- character:kamala khan: boolean (nullable = true)
 |-- character:she-hulk: boolean (nullable = true)
 |-- character:abomination: boolean (nullable = true)
 |-- character:odin: boolean (nullable = true)
 |-- character:pepper potts: boolean (nullable = true)
 |-- character:mj: boolean (nullable = true)
 |-- character:ned: boolean (nullable = true)
 |-- character:happy: boolean (nullable = true)
 |-- movie:black widow: boolean (nullable = true)
 |-- movie:shang-chi: boolean (nullable = true)
 |-- movie:eternals: boolean (nullable = true)
 |-- movie:spider-man: boolean (nullable = true)
 |-- movie:doctor strange: boolean (nullable = true)
 |-- movie:thor: boolean (nullable = true)
 |-- series:wandavision: boolean (nullable = true)
 |-- series:the falcon and the winter soldier: boolean (nullable = true)
 |-- series:loki: boolean (nullable = true)
 |-- series:what if: boolean (nullable = true)
 |-- series:hawkeye: boolean (nullable = true)
 |-- series:moon knight: boolean (nullable = true)
 |-- series:ms marvel: boolean (nullable = true)
 |-- series:she-hulk: boolean (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- normalized: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- clean_normalized: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- assembled: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence_embeddings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentiment: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)

Create Cleaned Dataframe¶

In [0]:
# sub_com_cleaned = sentiment_df.select()
dummy_colnames = [c for c in sentiment_df.columns if c.startswith("character:") or c.startswith("movie:") or c.startswith("series:")]
sub_com_nlp = sentiment_df.select("id", "author", "created_ts", col('assembled.result').getItem(0).alias("content"), "score", "is_submission", col("sentiment.result").getItem(0).alias("sentiment"), *dummy_colnames)
In [0]:
sub_com_nlp.printSchema()
root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- content: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- is_submission: boolean (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- character:black widow: boolean (nullable = true)
 |-- character:yelena belova: boolean (nullable = true)
 |-- character:shang-chi: boolean (nullable = true)
 |-- character:sersi: boolean (nullable = true)
 |-- character:ikaris: boolean (nullable = true)
 |-- character:thena: boolean (nullable = true)
 |-- character:ajak: boolean (nullable = true)
 |-- character:spider-man: boolean (nullable = true)
 |-- character:doctor strange: boolean (nullable = true)
 |-- character:electro: boolean (nullable = true)
 |-- character:green goblin: boolean (nullable = true)
 |-- character:doc ock: boolean (nullable = true)
 |-- character:wong: boolean (nullable = true)
 |-- character:wanda: boolean (nullable = true)
 |-- character:thor: boolean (nullable = true)
 |-- character:jane foster: boolean (nullable = true)
 |-- character:gorr: boolean (nullable = true)
 |-- character:vision: boolean (nullable = true)
 |-- character:agnes: boolean (nullable = true)
 |-- character:falcon: boolean (nullable = true)
 |-- character:bucky: boolean (nullable = true)
 |-- character:john walker: boolean (nullable = true)
 |-- character:captain america: boolean (nullable = true)
 |-- character:loki: boolean (nullable = true)
 |-- character:casey: boolean (nullable = true)
 |-- character:the watcher: boolean (nullable = true)
 |-- character:nick fury: boolean (nullable = true)
 |-- character:iron man: boolean (nullable = true)
 |-- character:hawkeye: boolean (nullable = true)
 |-- character:ultron: boolean (nullable = true)
 |-- character:red skull: boolean (nullable = true)
 |-- character:captain marvel: boolean (nullable = true)
 |-- character:captain carter: boolean (nullable = true)
 |-- character:hulk: boolean (nullable = true)
 |-- character:nebula: boolean (nullable = true)
 |-- character:hank pym: boolean (nullable = true)
 |-- character:ant-man: boolean (nullable = true)
 |-- character:thanos: boolean (nullable = true)
 |-- character:kate bishop: boolean (nullable = true)
 |-- character:kingpin: boolean (nullable = true)
 |-- character:moon knight: boolean (nullable = true)
 |-- character:arthur harrow: boolean (nullable = true)
 |-- character:kamala khan: boolean (nullable = true)
 |-- character:she-hulk: boolean (nullable = true)
 |-- character:abomination: boolean (nullable = true)
 |-- character:odin: boolean (nullable = true)
 |-- character:pepper potts: boolean (nullable = true)
 |-- character:mj: boolean (nullable = true)
 |-- character:ned: boolean (nullable = true)
 |-- character:happy: boolean (nullable = true)
 |-- movie:black widow: boolean (nullable = true)
 |-- movie:shang-chi: boolean (nullable = true)
 |-- movie:eternals: boolean (nullable = true)
 |-- movie:spider-man: boolean (nullable = true)
 |-- movie:doctor strange: boolean (nullable = true)
 |-- movie:thor: boolean (nullable = true)
 |-- series:wandavision: boolean (nullable = true)
 |-- series:the falcon and the winter soldier: boolean (nullable = true)
 |-- series:loki: boolean (nullable = true)
 |-- series:what if: boolean (nullable = true)
 |-- series:hawkeye: boolean (nullable = true)
 |-- series:moon knight: boolean (nullable = true)
 |-- series:ms marvel: boolean (nullable = true)
 |-- series:she-hulk: boolean (nullable = true)

Save Output into DBFS¶

In [0]:
# Save processed nlp dataframe into parquet files
sub_com_nlp.write.parquet("/FileStore/sub_com/nlp/")
In [0]:
# Load processed nlp dataframe from parquet files
sub_com_nlp = spark.read.parquet("/FileStore/sub_com/nlp/")

Load External Dataset¶

In [0]:
# Load external data
#Load imdb movie data
imdb = pd.read_csv("/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/imdb_rating.csv")
df_char = spark.read.parquet("/FileStore/sub_com/character_count/")
df_media = spark.read.parquet("/FileStore/sub_com/media_count/")
In [0]:
media_timeline_pd = df_media.toPandas()

Conduct the natural language processing work¶

The top 10 most common words overall¶

In [0]:
#calculate the top 10 most common words in content column
common_words = sub_com_nlp.withColumn('words', explode(split('content', ' '))) \
                   .groupBy('words') \
                   .count() \
                   .orderBy(desc('count')) \
                   .limit(10)
In [0]:
common_words.show()
+---------+------+
|    words| count|
+---------+------+
|    movie|728717|
|     make|594576|
|     show|491784|
|     dont|488970|
|character|440987|
|   people|439364|
|       im|435627|
|     time|432575|
|      mcu|426524|
|   marvel|384753|
+---------+------+

The distribution of text lengths¶

In [0]:
#calculate the length of each content
text_length = sub_com_nlp.select(col('content'), size(split(col('content'), ' ')).alias('word_count'))
In [0]:
text_length.show()
+--------------------+----------+
|             content|word_count|
+--------------------+----------+
|          white wolf|         2|
|           dont race|         2|
|    huh lowkey salty|         3|
|decent guy unfair...|         5|
|hey im bromo humo...|        11|
|battlestar join b...|        11|
|racist micro aggr...|        13|
|agatha nosy neigh...|         4|
|story make knot h...|         7|
|nice 5he characte...|         9|
|abouthttpsyoutube...|         6|
|feel walker cap o...|         9|
|                wait|         1|
|bucky fall helica...|         9|
|love shit talk sa...|         7|
|      username check|         2|
|yea kind assume s...|        17|
|uh sense leader t...|         6|
|              ao mcu|         2|
|episode make laug...|         4|
+--------------------+----------+
only showing top 20 rows

In [0]:
length_pd = text_length.toPandas()
In [0]:
#distribution
length_distribution = text_length.select(col('word_count')).groupby('word_count').count().orderBy(asc('word_count'))
distribution_pd = length_distribution.toPandas()
distribution_pd
word_count count
0 1 736297
1 2 450477
2 3 453257
3 4 416889
4 5 369788
... ... ...
862 3156 1
863 3189 1
864 3453 2
865 3565 2
866 3836 1

867 rows × 2 columns

In [0]:
# create a funtion to bin the text length
def bin_text(dat, max_len = 50, min_len = 0, binsize = 1):
  """
  bin the text length into groups
  max_len: the maximum of the text length
  binsize: the size of each bin
  dat: pd.DataFrame, with column word_count
  """
  
  bin_ls = list(range(min_len, max_len + binsize, binsize)) + [np.max(dat['word_count'])]
  bin_labels = [f'{i}-{i+binsize}' for i in range(min_len, max_len, binsize)] + [f'>{max_len}']

  dat['group'] = pd.cut(dat['word_count'], bins = bin_ls, labels = bin_labels)
  
  return dat
In [0]:
# change if you like
max_len, min_len = 50, 0
binsize = 2

binned_df = bin_text(distribution_pd, max_len, min_len, binsize)
binned_df = binned_df.groupby('group').apply(lambda df: df['count'].sum()).to_frame('count').reset_index()
In [0]:
# distribution plot
fig, ax = plt.subplots(figsize = (18,10), dpi = 300)

sns.set_theme(style = 'white')
ax = sns.barplot(x = binned_df['group'], y = binned_df['count'], alpha = 0.7, color = '#fcbf49')
for container in ax.containers:
  ax.bar_label(container, fontsize = 8)
  
plt.rcParams['font.family'] = 'P052'
ax.set_title('Text Length Distribution', fontsize = 20, y = 1.03)
ax.set_ylabel('Count', fontsize = 14)
ax.set_xlabel('Text Length', fontsize = 14)
plt.savefig('text_len_dist.png')

Important words according to TF-IDF¶

In [0]:
from pyspark.ml.feature import HashingTF as MLHashingTF
from pyspark.ml.feature import IDF as MLIDF
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import udf,lit
In [0]:
#extract content information
content_df = sub_com_nlp.select(col('content'))
content_df = content_df.withColumn("text_id", monotonically_increasing_id())
In [0]:
#bag-of-words
bow_df = (content_df.rdd\
  .map(lambda x : (x.text_id,x.content.split(" ")))\
  .toDF()\
  .withColumnRenamed("_1","text_id")\
  .withColumnRenamed("_2","features"))
In [0]:
#TF
htf = MLHashingTF(inputCol="features", outputCol="tf")
tf = htf.transform(bow_df)
tf.show()
+-------+--------------------+--------------------+
|text_id|            features|                  tf|
+-------+--------------------+--------------------+
|      0|       [white, wolf]|(262144,[75571,21...|
|      1|        [dont, race]|(262144,[87273,22...|
|      2|[huh, lowkey, salty]|(262144,[180689,2...|
|      3|[decent, guy, unf...|(262144,[34611,16...|
|      4|[hey, im, bromo, ...|(262144,[31015,61...|
|      5|[battlestar, join...|(262144,[54502,74...|
|      6|[racist, micro, a...|(262144,[5481,395...|
|      7|[agatha, nosy, ne...|(262144,[133662,1...|
|      8|[story, make, kno...|(262144,[17252,77...|
|      9|[nice, 5he, chara...|(262144,[12524,22...|
|     10|[abouthttpsyoutub...|(262144,[5451,500...|
|     11|[feel, walker, ca...|(262144,[30796,54...|
|     12|              [wait]|(262144,[150069],...|
|     13|[bucky, fall, hel...|(262144,[37521,42...|
|     14|[love, shit, talk...|(262144,[54502,64...|
|     15|   [username, check]|(262144,[23032,20...|
|     16|[yea, kind, assum...|(262144,[8443,488...|
|     17|[uh, sense, leade...|(262144,[29129,11...|
|     18|           [ao, mcu]|(262144,[170147,2...|
|     19|[episode, make, l...|(262144,[2437,897...|
+-------+--------------------+--------------------+
only showing top 20 rows

In [0]:
#IDF
idf = MLIDF(inputCol="tf", outputCol="idf")
tfidf = idf.fit(tf).transform(tf)
tfidf.show(5, truncate=False)
+-------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text_id|features                                                                     |tf                                                                                                                   |idf                                                                                                                                                                                                                                                              |
+-------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0      |[white, wolf]                                                                |(262144,[75571,211527],[1.0,1.0])                                                                                    |(262144,[75571,211527],[4.969006641728334,7.480357711105211])                                                                                                                                                                                                    |
|1      |[dont, race]                                                                 |(262144,[87273,227686],[1.0,1.0])                                                                                    |(262144,[87273,227686],[2.5815101421520965,6.076172362698935])                                                                                                                                                                                                   |
|2      |[huh, lowkey, salty]                                                         |(262144,[180689,230868,246953],[1.0,1.0,1.0])                                                                        |(262144,[180689,230868,246953],[8.203474561356371,6.579460854136553,6.474726267612691])                                                                                                                                                                          |
|3      |[decent, guy, unfairly, treat, sambucky]                                     |(262144,[34611,161102,163240,257513,259390],[1.0,1.0,1.0,1.0,1.0])                                                   |(262144,[34611,161102,163240,257513,259390],[10.007210221631142,3.8406961442001126,6.27442918245799,9.503639768746323,5.953506389680384])                                                                                                                        |
|4      |[hey, im, bromo, humor, change, tone, episode, episode, general, feel, goofy]|(262144,[31015,61756,61899,62133,75898,109557,113241,140461,193181,223059],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0])|(262144,[31015,61756,61899,62133,75898,109557,113241,140461,193181,223059],[2.695232909381837,5.694015111736679,3.174706602237098,6.472381050777942,5.878263772926869,13.32743854075963,5.348746734570755,7.264986649321078,7.043097104322763,4.279695532729672])|
+-------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 5 rows

In [0]:
#create an array column for tfidf value for future analysis
list_ = udf(lambda v: v.values.tolist(), ArrayType(DoubleType()))
tfidf = tfidf.withColumn("idf_list", list_("idf"))
In [0]:
#index of the max value of each list
tfidf = tfidf.withColumn('max_value_index',expr("array_position(idf_list,array_max(idf_list))-1"))
In [0]:
tfidf.show()
+-------+--------------------+--------------------+--------------------+--------------------+---------------+
|text_id|            features|                  tf|                 idf|            idf_list|max_value_index|
+-------+--------------------+--------------------+--------------------+--------------------+---------------+
|      0|       [white, wolf]|(262144,[75571,21...|(262144,[75571,21...|[4.96900664172833...|              1|
|      1|        [dont, race]|(262144,[87273,22...|(262144,[87273,22...|[2.58151014215209...|              1|
|      2|[huh, lowkey, salty]|(262144,[180689,2...|(262144,[180689,2...|[8.20347456135637...|              0|
|      3|[decent, guy, unf...|(262144,[34611,16...|(262144,[34611,16...|[10.0072102216311...|              0|
|      4|[hey, im, bromo, ...|(262144,[31015,61...|(262144,[31015,61...|[2.69523290938183...|              5|
|      5|[battlestar, join...|(262144,[54502,74...|(262144,[54502,74...|[4.68169578991286...|              3|
|      6|[racist, micro, a...|(262144,[5481,395...|(262144,[5481,395...|[9.86867130727599...|              0|
|      7|[agatha, nosy, ne...|(262144,[133662,1...|(262144,[133662,1...|[10.3541791230576...|              0|
|      8|[story, make, kno...|(262144,[17252,77...|(262144,[17252,77...|[4.88724639127954...|              6|
|      9|[nice, 5he, chara...|(262144,[12524,22...|(262144,[12524,22...|[3.63985690562423...|              4|
|     10|[abouthttpsyoutub...|(262144,[5451,500...|(262144,[5451,500...|[4.45493578155102...|              5|
|     11|[feel, walker, ca...|(262144,[30796,54...|(262144,[30796,54...|[4.27765823379588...|              5|
|     12|              [wait]|(262144,[150069],...|(262144,[150069],...| [4.417557824261202]|              0|
|     13|[bucky, fall, hel...|(262144,[37521,42...|(262144,[37521,42...|[10.2431655894537...|              0|
|     14|[love, shit, talk...|(262144,[54502,64...|(262144,[54502,64...|[4.68169578991286...|              0|
|     15|   [username, check]|(262144,[23032,20...|(262144,[23032,20...|[8.07807801893812...|              0|
|     16|[yea, kind, assum...|(262144,[8443,488...|(262144,[8443,488...|[5.97144251320312...|             10|
|     17|[uh, sense, leade...|(262144,[29129,11...|(262144,[29129,11...|[7.35532672216746...|              0|
|     18|           [ao, mcu]|(262144,[170147,2...|(262144,[170147,2...|[5.74260900584325...|              0|
|     19|[episode, make, l...|(262144,[2437,897...|(262144,[2437,897...|[4.49151690848628...|              2|
+-------+--------------------+--------------------+--------------------+--------------------+---------------+
only showing top 20 rows

In [0]:
tfidf_max.show(truncate=False)
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------+---------------+
|text_id|idf_list                                                                                                                                                                                                                                                                                                                             |max_value         |features                                                                                                                  |max_value_index|
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------+---------------+
|0      |[4.969006641728334, 7.480357711105211]                                                                                                                                                                                                                                                                                               |7.480357711105211 |[white, wolf]                                                                                                             |1              |
|1      |[2.5815101421520965, 6.076172362698935]                                                                                                                                                                                                                                                                                              |6.076172362698935 |[dont, race]                                                                                                              |1              |
|2      |[8.203474561356371, 6.579460854136553, 6.474726267612691]                                                                                                                                                                                                                                                                            |8.203474561356371 |[huh, lowkey, salty]                                                                                                      |0              |
|3      |[10.007210221631142, 3.8406961442001126, 6.27442918245799, 9.503639768746323, 5.953506389680384]                                                                                                                                                                                                                                     |10.007210221631142|[decent, guy, unfairly, treat, sambucky]                                                                                  |0              |
|4      |[2.695232909381837, 5.694015111736679, 3.174706602237098, 6.472381050777942, 5.878263772926869, 13.32743854075963, 5.348746734570755, 7.264986649321078, 7.043097104322763, 4.279695532729672]                                                                                                                                       |13.32743854075963 |[hey, im, bromo, humor, change, tone, episode, episode, general, feel, goofy]                                             |5              |
|5      |[4.681695789912868, 4.747875845490661, 4.889577198625523, 8.294824339944599, 5.899831826804189, 4.550397067821048, 3.413126452140354, 4.6725531803223035, 4.472995637256793, 5.9477370760724195, 5.303449371759564]                                                                                                                  |8.294824339944599 |[battlestar, join, bucky, sam, stop, walker, deep, end, care, friend, what]                                               |3              |
|6      |[9.868671307275998, 7.800216472462312, 4.3922516454728715, 3.767568693005586, 7.856569409013444, 3.398315730631671, 6.204213523439637, 4.6725531803223035, 9.525726556149166, 7.932296989576053, 9.458555027992102, 4.622457279712255, 4.565035938569167]                                                                            |9.868671307275998 |[racist, micro, aggression, place, steve, arrogance, think, mantel, sam, publicly, give, shield, museum]                  |0              |
|7      |[10.354179123057698, 5.428161379227023, 10.267167746068068, 8.127714515474835]                                                                                                                                                                                                                                                       |10.354179123057698|[agatha, nosy, neighbor, 350]                                                                                             |0              |
|8      |[4.887246391279542, 7.862665559193956, 2.4379144831788278, 4.097360903100415, 3.690016538450417, 3.398315730631671, 9.897042004405213]                                                                                                                                                                                               |9.897042004405213 |[story, make, knot, hardcore, fan, half, think]                                                                           |6              |
|9      |[3.639856905624234, 4.894496901790983, 5.148862106027095, 7.494579023828287, 12.480140680372427, 4.052799259683881, 8.67136402490977, 2.3627809950312626, 2.8357724248149934]                                                                                                                                                        |12.480140680372427|[nice, 5he, character, breath, experience, wouldnt, 2, hr, movie]                                                         |4              |
|10     |[4.4549357815510255, 5.103513364534742, 4.172199875566682, 3.7275065283500983, 4.6725531803223035, 14.138368756975959]                                                                                                                                                                                                               |14.138368756975959|[abouthttpsyoutubemoye_l80d4y, kid, call, sam, black, falcon]                                                             |5              |
|11     |[4.277658233795889, 3.2983124680830587, 3.174706602237098, 3.3604755319391955, 3.0904195795374405, 6.245916713455606, 4.964381214465575, 4.464781908979701, 5.303449371759564]                                                                                                                                                       |6.245916713455606 |[feel, walker, cap, opposite, hes, perfect, soldier, good, man]                                                           |5              |
|12     |[4.417557824261202]                                                                                                                                                                                                                                                                                                                  |4.417557824261202 |[wait]                                                                                                                    |0              |
|13     |[10.243165589453717, 7.24903206351784, 4.681695789912868, 4.598508833806416, 4.474805391370244, 7.679638700854365, 4.622457279712255, 7.943453291899503]                                                                                                                                                                             |10.243165589453717|[bucky, fall, helicarrier, rescue, steve, w, high, fall, know]                                                            |0              |
|14     |[4.681695789912868, 4.194351252559876, 4.049792645994886, 2.4379144831788278, 4.6725531803223035, 3.232880775727519, 2.702116474577111]                                                                                                                                                                                              |4.681695789912868 |[love, shit, talk, sam, bucky, show, make]                                                                                |0              |
|15     |[8.078078018938124, 4.715157416456306]                                                                                                                                                                                                                                                                                               |8.078078018938124 |[username, check]                                                                                                         |0              |
|16     |[5.971442513203123, 5.107095244073235, 3.2983124680830587, 9.288293527892145, 4.397458959197189, 4.274284934339655, 5.196888703210969, 5.6581508016846165, 6.41215610646843, 5.1277606320535245, 11.167954291406257, 6.064264662264575, 3.43943130351661, 8.427941739601088, 4.798316778478119, 6.513505659989033, 4.565035938569167]|11.167954291406257|[yea, kind, assume, scene, hes, throw, shield, hit, target, lake, gma, vibranium, super, light, versatile, material, easy]|10             |
|17     |[7.3553267221674625, 4.322497309695291, 6.253886377315836, 3.353223326060119, 4.087226578356274, 7.096082585036216]                                                                                                                                                                                                                  |7.3553267221674625|[uh, sense, leader, tomorrow, back, line]                                                                                 |0              |
|18     |[5.742609005843253, 2.781319626585721]                                                                                                                                                                                                                                                                                               |5.742609005843253 |[ao, mcu]                                                                                                                 |0              |
|19     |[4.491516908486286, 2.4379144831788278, 5.737372359962233, 3.632493324660539]                                                                                                                                                                                                                                                        |5.737372359962233 |[episode, make, laugh, hard]                                                                                              |2              |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------+---------------+
only showing top 20 rows

In [0]:
#get the max value of each idf_list
tfidf_max = tfidf.select(col('text_id'), col('idf_list'), array_max(tfidf.idf_list).alias('max_value'), col('features'), col('max_value_index'))
In [0]:
#get the max idf value of all words and its index in the text
tfidf_max.createOrReplaceTempView("df")
spark.sql("SELECT * FROM df order by max_value desc limit 1").show()
+-------+--------------------+------------------+--------------------+---------------+
|text_id|            idf_list|         max_value|            features|max_value_index|
+-------+--------------------+------------------+--------------------+---------------+
| 511587|[13964.774292624299]|13964.774292624299|[heck, heck, heck...|              0|
+-------+--------------------+------------------+--------------------+---------------+

According to TF-IDF, the most important word is 'heck'.

Media NLP & Visualization¶

1. Determine the top 5 popular Marvel movie or Disney+ Series in 2021-2022 Aug¶

In [0]:
media_info = spark.read.parquet("/FileStore/sub_com/media_count/")
media_info_pd_orginal = media_info.toPandas()
media_info_pd = media_info_pd_orginal.sort_values(by=['mention_count'], ascending=False)
media_info_pd = media_info_pd.reset_index(drop=True)
In [0]:
media_info_pd
media_name release_date is_movie box_office (USD) investment (USD) mention_count
0 spider-man 2021-12-17 True 1.916307e+09 200000000 136529
1 loki 2021-06-09 False NaN 225000000 133711
2 thor 2022-07-08 True 7.607553e+08 250000000 132231
3 wandavision 2021-01-15 False NaN 200000000 59422
4 eternals 2021-11-05 True 4.020649e+08 200000000 49710
5 Hawkeye 2021-11-24 False NaN 150000000 45844
6 black widow 2021-07-09 True 3.797517e+08 200000000 41987
7 what if 2021-08-11 False NaN 150000000 40495
8 doctor strange 2022-05-06 True 9.557758e+08 200000000 31266
9 moon knight 2022-03-30 False NaN 150000000 25062
10 shang-chi 2021-09-03 True 4.322433e+08 150000000 23290
11 ms marvel 2022-06-08 False NaN 150000000 20818
12 she-hulk 2022-08-17 False NaN 225000000 19844
13 the falcon and the winter soldier 2021-03-19 False NaN 150000000 2479
In [0]:
#Determine the top 5 movies or series to answer the "popular" insights for media class
media_info_pd[['media_name', 'mention_count']].head(5)
media_name mention_count
0 spider-man 136529
1 loki 133711
2 thor 132231
3 wandavision 59422
4 eternals 49710
In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(26, 12))
p1 = sns.barplot(x="media_name", y="mention_count", data=media_info_pd,dodge=False, color = "#fcbf49", alpha = 0.7)
for container in ax.containers:
  ax.bar_label(container, fontsize = 10)
ax.tick_params(axis='x', rotation=45)
p1.set_title("The Popularity of Media in 2021-2022 Aug", fontsize='20')
p1.set_xlabel("Media Name", fontsize = "20")
p1.set_ylabel("Count", fontsize = "20")
Out[93]: Text(0, 0.5, 'Count')

2. Catch the correlation between imdb media ratings, box office, investment and audience reviews¶

In [0]:
#retrive movie and seires
media_columns = [x for x in sub_com_nlp.columns if (x.startswith('movie:') or x.startswith('series:'))]
In [0]:
import pandas as pd
media_stats = {'media_name': [], 'type': [], 'count': [], 'score': [], 'positive': []}

for media in media_columns:
    
    t, m = media.split(':')
    media_stats['type'].append(t)
    media_stats['media_name'].append(m)
    
    df_temp = sub_com_nlp.filter((col(media)))
    
    count = df_temp.count()
    media_stats['count'].append(count)
    
    sentiment_summary = df_temp.groupby('sentiment').count()
    positive_perc = sentiment_summary.withColumn('perc', (col('count') / count)).filter(col('sentiment')=='positive').collect()[0]['perc']
    media_stats['positive'].append(positive_perc)
    
    score_avg = df_temp.agg({'score': 'avg'}).collect()[0]['avg(score)']
    media_stats['score'].append(score_avg)
In [0]:
media_stats_pd = pd.DataFrame(media_stats)
In [0]:
corr_df = pd.merge(media_stats_pd, imdb, on = "media_name")
In [0]:
sort_movie = media_info_pd_orginal.sort_values('media_name', ascending=False)
sort_movie = sort_movie.reset_index(drop=True)
sort_corr = corr_df.sort_values('media_name', ascending=False)
sort_corr = sort_corr.reset_index(drop=True)
In [0]:
final_corr = pd.merge(sort_corr, sort_movie, on = "media_name")
In [0]:
final_corr = final_corr.drop(columns = ["is_movie", "release_date", "mention_count"])
In [0]:
def hide_current_axis(*args, **kwds):
    plt.gca().set_visible(False)
    
sns.set_style("whitegrid", {'axes.grid' : False})
sns.set(font_scale = 1.1)
g = sns.pairplot(final_corr, hue ="type", markers=["o", "s"], palette = "summer")
g.map_lower(sns.regplot, ci=None)
g.map_upper(hide_current_axis)

(xmin, _), (_, ymax) = g.axes[0, 0].get_position().get_points()
(_, ymin), (xmax, _) = g.axes[-1, -1].get_position().get_points()

ax = g.fig.add_axes([xmin, ymin, xmax - xmin, ymax - ymin], facecolor='none')

corr1 = final_corr[["count", "score", "positive","imdb_rating","box_office (USD)","investment (USD)"]].corr()
mask1 = np.tril(np.ones_like(corr1, dtype=bool))
sns.heatmap(corr1, mask=mask1, vmax=.5, vmin=-.5,
            linewidths=.3, cmap="summer", cbar=False, annot=True, annot_kws={'size': 15}, ax=ax)

ax.set_title("Catch the correlation between imdb media ratings, box office, investment and audience reviews", fontsize = "15")
ax.set_xticks([])
ax.set_yticks([])
Out[152]: []

3. Observe the fluctuation of heat of discussion/sentiment/reviews towards medias.¶

In [0]:
# select the columns for media analysis
media_ls = [i for i in sub_com_nlp.columns if i[:5] == 'movie' or i[:6] == 'series']
media_col_ls = ['created_ts', 'score', 'sentiment'] + media_ls
media_nlp = sub_com_nlp[media_col_ls]
# extract the date from created_ts
media_nlp = media_nlp.withColumn('date', to_date('created_ts'))
# cast the data type of the media columns to integer
for col_name in media_ls:
    media_nlp = media_nlp.withColumn(col_name, col(col_name).cast('int'))
In [0]:
media_nlp.printSchema()
root
 |-- created_ts: timestamp (nullable = true)
 |-- score: integer (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- movie:black widow: integer (nullable = true)
 |-- movie:shang-chi: integer (nullable = true)
 |-- movie:eternals: integer (nullable = true)
 |-- movie:spider-man: integer (nullable = true)
 |-- movie:doctor strange: integer (nullable = true)
 |-- movie:thor: integer (nullable = true)
 |-- series:wandavision: integer (nullable = true)
 |-- series:the falcon and the winter soldier: integer (nullable = true)
 |-- series:loki: integer (nullable = true)
 |-- series:what if: integer (nullable = true)
 |-- series:hawkeye: integer (nullable = true)
 |-- series:moon knight: integer (nullable = true)
 |-- series:ms marvel: integer (nullable = true)
 |-- series:she-hulk: integer (nullable = true)
 |-- date: date (nullable = true)

In [0]:
#define a function to analysis the discussion heat of medias based on score and sentimental analysis
def media_heat(dat, score_weight = 0, sentiment_weight = [1,1,1]):
    """
    calculate the discussion heat of characters, return a pd.dataframe
    dat: pyspark df, have to include date and sentiment column, timestamp is stripped from date
    score_weight: >= 0, if set > 0, will give score a weight and assign this weight together with score to each comment, default is 0
    sentiment_weight: list, contains weight on the sentiment for positive, neutral, negative; 1,1,1 will add up all the comments
    """
    if score_weight != 0:
        for col_name in media_ls:
            dat = dat.withColumn(col_name, col(col_name)*col(score)*score_weight)
  

    dat_pd = dat.groupby('date', 'sentiment').sum().toPandas()
    dat_pd = dat_pd.sort_values(by = ['date', 'sentiment'])
    dat_pd.columns = ['date'] + media_col_ls[1:]
  
    for m in media_ls:
        dat_pd[m] = dat_pd[m]*np.where(dat_pd['sentiment'] == 'positive', sentiment_weight[0], np.where(dat_pd['sentiment'] == 'negative', sentiment_weight[2], sentiment_weight[1]))
  
    dat_pd = dat_pd.groupby('date').sum().reset_index()
    dat_pd.columns = ['date', 'score'] + media_ls
  
    return dat_pd
In [0]:
media_nlp_pd = media_heat(media_nlp)
In [0]:
#change column name, generate the final heat table of each media
media_heat_pd = media_nlp_pd[['date']+media_ls]
media_rename_ls = ['date'] + [i[6:] for i in media_ls[0:6]] + [j[7:] for j in media_ls[6:]]
media_heat_pd.columns = media_rename_ls
media_heat_pd
date black widow shang-chi eternals spider-man doctor strange thor wandavision the falcon and the winter soldier loki what if hawkeye moon knight ms marvel she-hulk
0 2021-01-01 13 4 7 34 7 40 33 1 24 4 12 4 2 4
1 2021-01-02 34 5 10 50 7 58 33 3 25 9 22 7 15 13
2 2021-01-03 32 8 11 54 14 181 32 0 43 7 11 2 9 22
3 2021-01-04 16 6 3 52 8 61 34 2 32 5 16 2 11 4
4 2021-01-05 21 6 9 55 22 43 78 1 45 23 20 3 11 11
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
603 2022-08-27 57 64 51 183 58 224 95 0 110 36 86 66 93 518
604 2022-08-28 46 55 39 151 76 164 162 3 211 121 146 131 140 262
605 2022-08-29 19 7 13 145 18 145 31 0 63 30 35 34 26 119
606 2022-08-30 31 49 31 133 21 190 32 2 60 22 34 33 64 246
607 2022-08-31 24 51 33 135 31 143 31 1 67 13 36 46 38 167

608 rows × 15 columns

In [0]:
## visualize
def visulize_media_heat(dat, timeline, topn = 10):
    rank_df = dat.sum().sort_values(ascending = False)
    char_chose = list(rank_df[:topn].index)
    dat_viz = dat[['date']+char_chose]
    dat_viz = pd.melt(dat_viz, id_vars = ['date'], value_vars = char_chose)
    dat_viz.columns = ['date', 'media', 'heat']

    fig = px.line(dat_viz, x = 'date', y = 'heat', color = 'media', width = 1100, height = 600, title = 'Discussion Heat of Top 10 Marval Medias on Reddit')
    fig.update_layout(plot_bgcolor = '#FCFBF8', margin = dict(t = 70, l = 20, b = 20, r = 20), font_family = 'Roboto Slab', xaxis_title = 'Date',
                    yaxis_title = 'Heat', title = {'font': {'size':24}})

    for index, row in timeline.iterrows():
        dt = row['release_date']
        pos = int((datetime(dt.year, dt.month, dt.day) - datetime(1970,1,1)).total_seconds()) * 1000
        fig.add_vline(x = pos, line_width = 1, line_dash = 'dot', opacity = 0.7)
        fig.add_annotation(x = pos+1, y = rd.randint(2000, 4000), text = row['media_name'], showarrow = True, arrowhead = 1)

    fig.show()
    fig.write_html("media_heat_viz.html")
In [0]:
visulize_media_heat(dat = media_heat_pd, timeline = media_timeline_pd, topn = 10)
<command-1027951213072273>:3: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

Character NLP & Visualization¶

1. Determine the most popular Marvel characters (superhero/villain and different gender) in 2021-2022 Aug and their attributes.¶

In [0]:
#the most popular marvel character over all
df_char.createOrReplaceTempView("character")
spark.sql("SELECT * FROM character order by mention_count desc limit 1").show()
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
| name|intelligence|strength|speed|durability|power|combat|gender|  race|alignment|mention_count|
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
|wanda|         100|      10|   29|        70|  100|    80|Female|Mutant|      bad|       186333|
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+

In [0]:
#the most popular female character
spark.sql("SELECT * FROM character where gender like 'Female' order by mention_count desc limit 1").show()
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
| name|intelligence|strength|speed|durability|power|combat|gender|  race|alignment|mention_count|
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
|wanda|         100|      10|   29|        70|  100|    80|Female|Mutant|      bad|       186333|
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+

In [0]:
#the most popular male character
spark.sql("SELECT * FROM character where gender like 'Male' order by mention_count desc limit 1").show()
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
|      name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count|
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
|spider-man|          90|      55|   67|        75|   74|    85|  Male|Human|     good|       146361|
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+

In [0]:
#the most popular hero character
spark.sql("SELECT * FROM character where alignment like 'good' order by mention_count desc limit 1").show()
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
|      name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count|
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
|spider-man|          90|      55|   67|        75|   74|    85|  Male|Human|     good|       146361|
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+

In [0]:
#the most popular villain character
spark.sql("SELECT * FROM character where alignment like 'bad' order by mention_count desc limit 1").show()
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
| name|intelligence|strength|speed|durability|power|combat|gender|  race|alignment|mention_count|
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+
|wanda|         100|      10|   29|        70|  100|    80|Female|Mutant|      bad|       186333|
+-----+------------+--------+-----+----------+-----+------+------+------+---------+-------------+

In [0]:
#the most popular human character
spark.sql("SELECT * FROM character where race like 'Human' order by mention_count desc limit 1").show()
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
|      name|intelligence|strength|speed|durability|power|combat|gender| race|alignment|mention_count|
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+
|spider-man|          90|      55|   67|        75|   74|    85|  Male|Human|     good|       146361|
+----------+------------+--------+-----+----------+-----+------+------+-----+---------+-------------+

2. Observe the fluctuation of heat of discussion/sentiment/reviews towards characters¶

In [0]:
# select the columns for characters analysis
char_ls = [i for i in sub_com_nlp.columns if i[:9] == 'character']
col_ls = ['created_ts', 'score', 'sentiment'] + char_ls
char_nlp = sub_com_nlp[col_ls]
# extract the date from created_ts
char_nlp = char_nlp.withColumn('date', to_date('created_ts'))
# cast the data type of the character columns to integer
for col_name in char_ls:
  char_nlp = char_nlp.withColumn(col_name, col(col_name).cast('int'))
In [0]:
def char_heat(dat, score_weight = 0, sentiment_weight = [1,1,1]):
  """
  calculate the discussion heat of characters, return a pd.dataframe
  dat: pyspark df, have to include date and sentiment column, timestamp is stripped from date
  score_weight: >= 0, if set > 0, will give score a weight and assign this weight together with score to each comment, default is 0
  sentiment_weight: list, contains weight on the sentiment for positive, neutral, negative; 1,1,1 will add up all the comments
  """
  if score_weight != 0:
    for col_name in char_ls:
      dat = dat.withColumn(col_name, col(col_name)*col(score)*score_weight)
      
  dat_pd = dat.groupby('date', 'sentiment').sum().toPandas()
  dat_pd = dat_pd.sort_values(by = ['date', 'sentiment'])
  dat_pd.columns = ['date'] + col_ls[1:]
  
  for char in char_ls:
    dat_pd[char] = dat_pd[char]*np.where(dat_pd['sentiment'] == 'positive', sentiment_weight[0], np.where(dat_pd['sentiment'] == 'negative', sentiment_weight[2], sentiment_weight[1]))
  
  dat_pd = dat_pd.groupby('date').sum().reset_index()
  dat_pd.columns = ['date', 'score'] + char_ls
  
  return dat_pd
In [0]:
char_nlp_pd = char_heat(char_nlp)
In [0]:
char_heat_pd = char_nlp_pd[['date']+char_ls]
rename_ls = ['date'] + [i[10:] for i in char_ls]
char_heat_pd.columns = rename_ls
In [0]:
## visualize
def visulize_char_heat(dat, timeline, topn = 10):
  
  rank_df = dat.sum().sort_values(ascending = False)
  char_chose = list(rank_df[:10].index)
  dat_viz = dat[['date']+char_chose]
  dat_viz = pd.melt(dat_viz, id_vars = ['date'], value_vars = char_chose)
  dat_viz.columns = ['date', 'character', 'heat']
  
  fig = px.line(dat_viz, x = 'date', y = 'heat', color = 'character', width = 1100, height = 600, title = 'Discussion Heat of Marval Characters on Reddit')
  fig.update_layout(plot_bgcolor = '#FCFBF8', margin = dict(t = 70, l = 20, b = 20, r = 20), font_family = 'Roboto Slab', xaxis_title = 'Date',
                    yaxis_title = 'Heat', title = {'font': {'size':24}})
  
  for index, row in timeline.iterrows():
      dt = row['release_date']
      pos = int((datetime(dt.year, dt.month, dt.day) - datetime(1970,1,1)).total_seconds()) * 1000
      fig.add_vline(x = pos, line_width = 1, line_dash = 'dot', opacity = 0.7)
      fig.add_annotation(x = pos+1, y = rd.randint(2000, 4000), text = row['media_name'], showarrow = True, arrowhead = 1)
      
  fig.show()
  fig.write_html("char_heat_viz.html")
In [0]:
visulize_char_heat(dat = char_heat_pd, timeline = media_timeline_pd, topn = 10)
<command-1027951213072224>:4: FutureWarning:

Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

3. Track the most co-related character roles that people always mention together which means the higher of the correlation the higher audience of these two roles¶

In [0]:
import numpy as np

character_columns = [c for c in sub_com_nlp.columns if c.startswith('character:')]
num_character = len(character_columns)
char_cooccur_count = np.zeros((num_character, num_character))
char_cooccur_positve_perc = np.zeros((num_character, num_character))
char_cooccur_score = np.zeros((num_character, num_character))
In [0]:
num_steps_required = int(num_character * (num_character + 1) / 2)
acc = 0

for i in range(num_character):
  for j in range(i, num_character):
    char1 = character_columns[i]
    char2 = character_columns[j]
    if i == j:
      df_temp = sub_com_nlp.filter((col(char1)))
    else:
      df_temp = sub_com_nlp.filter((col(char1)) & (col(char2)))
    
    acc += 1
    print(char1, char2, str(acc)+'/'+str(num_steps_required)+' '*20, end='\r', sep=' | ')
    count = df_temp.count()
    if count == 0:
      positive_perc = 0
      score_avg = 0
    else:
      sentiment_summary = df_temp.groupby('sentiment').count()
      positive_perc = sentiment_summary.withColumn('perc', (col('count') / count)).filter(col('sentiment')=='positive').collect()
      if len(positive_perc) > 0:
        positive_perc = positive_perc[0]['perc']
      else:
        positive_perc = 0.0
      score_avg = df_temp.agg({'score': 'avg'}).collect()[0]['avg(score)']
    char_cooccur_count[i, j] = char_cooccur_count[j, i] = count
    char_cooccur_positve_perc[i, j] = char_cooccur_positve_perc[j, i] = positive_perc
    char_cooccur_score[i, j] = char_cooccur_score[j, i] = score_avg
character:black widow | character:black widow | 1/1275                    
character:black widow | character:yelena belova | 2/1275                    
character:black widow | character:shang-chi | 3/1275                    
character:black widow | character:sersi | 4/1275                    
character:black widow | character:ikaris | 5/1275                    
character:black widow | character:thena | 6/1275                    
character:black widow | character:ajak | 7/1275                    
character:black widow | character:spider-man | 8/1275                    
character:black widow | character:doctor strange | 9/1275                    
character:black widow | character:electro | 10/1275                    
character:black widow | character:green goblin | 11/1275                    
character:black widow | character:doc ock | 12/1275                    
character:black widow | character:wong | 13/1275                    
character:black widow | character:wanda | 14/1275                    
character:black widow | character:thor | 15/1275                    
character:black widow | character:jane foster | 16/1275                    
character:black widow | character:gorr | 17/1275                    
character:black widow | character:vision | 18/1275                    
character:black widow | character:agnes | 19/1275                    
character:black widow | character:falcon | 20/1275                    
character:black widow | character:bucky | 21/1275                    
character:black widow | character:john walker | 22/1275                    
character:black widow | character:captain america | 23/1275                    
character:black widow | character:loki | 24/1275                    
character:black widow | character:casey | 25/1275                    
character:black widow | character:the watcher | 26/1275                    
character:black widow | character:nick fury | 27/1275                    
character:black widow | character:iron man | 28/1275                    
character:black widow | character:hawkeye | 29/1275                    
character:black widow | character:ultron | 30/1275                    
character:black widow | character:red skull | 31/1275                    
character:black widow | character:captain marvel | 32/1275                    
character:black widow | character:captain carter | 33/1275                    
character:black widow | character:hulk | 34/1275                    
character:black widow | character:nebula | 35/1275                    
character:black widow | character:hank pym | 36/1275                    
character:black widow | character:ant-man | 37/1275                    
character:black widow | character:thanos | 38/1275                    
character:black widow | character:kate bishop | 39/1275                    
character:black widow | character:kingpin | 40/1275                    
character:black widow | character:moon knight | 41/1275                    
character:black widow | character:arthur harrow | 42/1275                    
character:black widow | character:kamala khan | 43/1275                    
character:black widow | character:she-hulk | 44/1275                    
character:black widow | character:abomination | 45/1275                    
character:black widow | character:odin | 46/1275                    
character:black widow | character:pepper potts | 47/1275                    
character:black widow | character:mj | 48/1275                    
character:black widow | character:ned | 49/1275                    
character:black widow | character:happy | 50/1275                    
character:yelena belova | character:yelena belova | 51/1275                    
character:yelena belova | character:shang-chi | 52/1275                    
character:yelena belova | character:sersi | 53/1275                    
character:yelena belova | character:ikaris | 54/1275                    
character:yelena belova | character:thena | 55/1275                    
character:yelena belova | character:ajak | 56/1275                    
character:yelena belova | character:spider-man | 57/1275                    
character:yelena belova | character:doctor strange | 58/1275                    
character:yelena belova | character:electro | 59/1275                    
character:yelena belova | character:green goblin | 60/1275                    
character:yelena belova | character:doc ock | 61/1275                    
character:yelena belova | character:wong | 62/1275                    
character:yelena belova | character:wanda | 63/1275                    
character:yelena belova | character:thor | 64/1275                    
character:yelena belova | character:jane foster | 65/1275                    
character:yelena belova | character:gorr | 66/1275                    
character:yelena belova | character:vision | 67/1275                    
character:yelena belova | character:agnes | 68/1275                    
character:yelena belova | character:falcon | 69/1275                    
character:yelena belova | character:bucky | 70/1275                    
character:yelena belova | character:john walker | 71/1275                    
character:yelena belova | character:captain america | 72/1275                    
character:yelena belova | character:loki | 73/1275                    
character:yelena belova | character:casey | 74/1275                    
character:yelena belova | character:the watcher | 75/1275                    
character:yelena belova | character:nick fury | 76/1275                    
character:yelena belova | character:iron man | 77/1275                    
character:yelena belova | character:hawkeye | 78/1275                    
character:yelena belova | character:ultron | 79/1275                    
character:yelena belova | character:red skull | 80/1275                    
character:yelena belova | character:captain marvel | 81/1275                    
character:yelena belova | character:captain carter | 82/1275                    
character:yelena belova | character:hulk | 83/1275                    
character:yelena belova | character:nebula | 84/1275                    
character:yelena belova | character:hank pym | 85/1275                    
character:yelena belova | character:ant-man | 86/1275                    
character:yelena belova | character:thanos | 87/1275                    
character:yelena belova | character:kate bishop | 88/1275                    
character:yelena belova | character:kingpin | 89/1275                    
character:yelena belova | character:moon knight | 90/1275                    
character:yelena belova | character:arthur harrow | 91/1275                    
character:yelena belova | character:kamala khan | 92/1275                    
character:yelena belova | character:she-hulk | 93/1275                    
character:yelena belova | character:abomination | 94/1275                    
character:yelena belova | character:odin | 95/1275                    
character:yelena belova | character:pepper potts | 96/1275                    
character:yelena belova | character:mj | 97/1275                    
character:yelena belova | character:ned | 98/1275                    
character:yelena belova | character:happy | 99/1275                    
character:shang-chi | character:shang-chi | 100/1275                    
character:shang-chi | character:sersi | 101/1275                    
character:shang-chi | character:ikaris | 102/1275                    
character:shang-chi | character:thena | 103/1275                    
character:shang-chi | character:ajak | 104/1275                    
character:shang-chi | character:spider-man | 105/1275                    
character:shang-chi | character:doctor strange | 106/1275                    
character:shang-chi | character:electro | 107/1275                    
character:shang-chi | character:green goblin | 108/1275                    
character:shang-chi | character:doc ock | 109/1275                    
character:shang-chi | character:wong | 110/1275                    
character:shang-chi | character:wanda | 111/1275                    
character:shang-chi | character:thor | 112/1275                    
character:shang-chi | character:jane foster | 113/1275                    
character:shang-chi | character:gorr | 114/1275                    
character:shang-chi | character:vision | 115/1275                    
character:shang-chi | character:agnes | 116/1275                    
character:shang-chi | character:falcon | 117/1275                    
character:shang-chi | character:bucky | 118/1275                    
character:shang-chi | character:john walker | 119/1275                    
character:shang-chi | character:captain america | 120/1275                    
character:shang-chi | character:loki | 121/1275                    
character:shang-chi | character:casey | 122/1275                    
character:shang-chi | character:the watcher | 123/1275                    
character:shang-chi | character:nick fury | 124/1275                    
character:shang-chi | character:iron man | 125/1275                    
character:shang-chi | character:hawkeye | 126/1275                    
character:shang-chi | character:ultron | 127/1275                    
character:shang-chi | character:red skull | 128/1275                    
character:shang-chi | character:captain marvel | 129/1275                    
character:shang-chi | character:captain carter | 130/1275                    
character:shang-chi | character:hulk | 131/1275                    
character:shang-chi | character:nebula | 132/1275                    
character:shang-chi | character:hank pym | 133/1275                    
character:shang-chi | character:ant-man | 134/1275                    
character:shang-chi | character:thanos | 135/1275                    
character:shang-chi | character:kate bishop | 136/1275                    
character:shang-chi | character:kingpin | 137/1275                    
character:shang-chi | character:moon knight | 138/1275                    
character:shang-chi | character:arthur harrow | 139/1275                    
character:shang-chi | character:kamala khan | 140/1275                    
character:shang-chi | character:she-hulk | 141/1275                    
character:shang-chi | character:abomination | 142/1275                    
character:shang-chi | character:odin | 143/1275                    
character:shang-chi | character:pepper potts | 144/1275                    
character:shang-chi | character:mj | 145/1275                    
character:shang-chi | character:ned | 146/1275                    
character:shang-chi | character:happy | 147/1275                    
character:sersi | character:sersi | 148/1275                    
character:sersi | character:ikaris | 149/1275                    
character:sersi | character:thena | 150/1275                    
character:sersi | character:ajak | 151/1275                    
character:sersi | character:spider-man | 152/1275                    
character:sersi | character:doctor strange | 153/1275                    
character:sersi | character:electro | 154/1275                    
character:sersi | character:green goblin | 155/1275                    
character:sersi | character:doc ock | 156/1275                    
character:sersi | character:wong | 157/1275                    
character:sersi | character:wanda | 158/1275                    
character:sersi | character:thor | 159/1275                    
character:sersi | character:jane foster | 160/1275                    
character:sersi | character:gorr | 161/1275                    
character:sersi | character:vision | 162/1275                    
character:sersi | character:agnes | 163/1275                    
character:sersi | character:falcon | 164/1275                    
character:sersi | character:bucky | 165/1275                    
character:sersi | character:john walker | 166/1275                    
character:sersi | character:captain america | 167/1275                    
character:sersi | character:loki | 168/1275                    
character:sersi | character:casey | 169/1275                    
character:sersi | character:the watcher | 170/1275                    
character:sersi | character:nick fury | 171/1275                    
character:sersi | character:iron man | 172/1275                    
character:sersi | character:hawkeye | 173/1275                    
character:sersi | character:ultron | 174/1275                    
character:sersi | character:red skull | 175/1275                    
character:sersi | character:captain marvel | 176/1275                    
character:sersi | character:captain carter | 177/1275                    
character:sersi | character:hulk | 178/1275                    
character:sersi | character:nebula | 179/1275                    
character:sersi | character:hank pym | 180/1275                    
character:sersi | character:ant-man | 181/1275                    
character:sersi | character:thanos | 182/1275                    
character:sersi | character:kate bishop | 183/1275                    
character:sersi | character:kingpin | 184/1275                    
character:sersi | character:moon knight | 185/1275                    
character:sersi | character:arthur harrow | 186/1275                    
character:sersi | character:kamala khan | 187/1275                    
character:sersi | character:she-hulk | 188/1275                    
character:sersi | character:abomination | 189/1275                    
character:sersi | character:odin | 190/1275                    
character:sersi | character:pepper potts | 191/1275                    
character:sersi | character:mj | 192/1275                    
character:sersi | character:ned | 193/1275                    
character:sersi | character:happy | 194/1275                    
character:ikaris | character:ikaris | 195/1275                    
character:ikaris | character:thena | 196/1275                    
character:ikaris | character:ajak | 197/1275                    
character:ikaris | character:spider-man | 198/1275                    
character:ikaris | character:doctor strange | 199/1275                    
character:ikaris | character:electro | 200/1275                    
character:ikaris | character:green goblin | 201/1275                    
character:ikaris | character:doc ock | 202/1275                    
character:ikaris | character:wong | 203/1275                    
character:ikaris | character:wanda | 204/1275                    
character:ikaris | character:thor | 205/1275                    
character:ikaris | character:jane foster | 206/1275                    
character:ikaris | character:gorr | 207/1275                    
character:ikaris | character:vision | 208/1275                    
character:ikaris | character:agnes | 209/1275                    
character:ikaris | character:falcon | 210/1275                    
character:ikaris | character:bucky | 211/1275                    
character:ikaris | character:john walker | 212/1275                    
character:ikaris | character:captain america | 213/1275                    
character:ikaris | character:loki | 214/1275                    
character:ikaris | character:casey | 215/1275                    
character:ikaris | character:the watcher | 216/1275                    
character:ikaris | character:nick fury | 217/1275                    
character:ikaris | character:iron man | 218/1275                    
character:ikaris | character:hawkeye | 219/1275                    
character:ikaris | character:ultron | 220/1275                    
character:ikaris | character:red skull | 221/1275                    
character:ikaris | character:captain marvel | 222/1275                    
character:ikaris | character:captain carter | 223/1275                    
character:ikaris | character:hulk | 224/1275                    
character:ikaris | character:nebula | 225/1275                    
character:ikaris | character:hank pym | 226/1275                    
character:ikaris | character:ant-man | 227/1275                    
character:ikaris | character:thanos | 228/1275                    
character:ikaris | character:kate bishop | 229/1275                    
character:ikaris | character:kingpin | 230/1275                    
character:ikaris | character:moon knight | 231/1275                    
character:ikaris | character:arthur harrow | 232/1275                    
character:ikaris | character:kamala khan | 233/1275                    
character:ikaris | character:she-hulk | 234/1275                    
character:ikaris | character:abomination | 235/1275                    
character:ikaris | character:odin | 236/1275                    
character:ikaris | character:pepper potts | 237/1275                    
character:ikaris | character:mj | 238/1275                    
character:ikaris | character:ned | 239/1275                    
character:ikaris | character:happy | 240/1275                    
character:thena | character:thena | 241/1275                    
character:thena | character:ajak | 242/1275                    
character:thena | character:spider-man | 243/1275                    
character:thena | character:doctor strange | 244/1275                    
character:thena | character:electro | 245/1275                    
character:thena | character:green goblin | 246/1275                    
character:thena | character:doc ock | 247/1275                    
character:thena | character:wong | 248/1275                    
character:thena | character:wanda | 249/1275                    
character:thena | character:thor | 250/1275                    
character:thena | character:jane foster | 251/1275                    
character:thena | character:gorr | 252/1275                    
character:thena | character:vision | 253/1275                    
character:thena | character:agnes | 254/1275                    
character:thena | character:falcon | 255/1275                    
character:thena | character:bucky | 256/1275                    
character:thena | character:john walker | 257/1275                    
character:thena | character:captain america | 258/1275                    
character:thena | character:loki | 259/1275                    
character:thena | character:casey | 260/1275                    
character:thena | character:the watcher | 261/1275                    
character:thena | character:nick fury | 262/1275                    
character:thena | character:iron man | 263/1275                    
character:thena | character:hawkeye | 264/1275                    
character:thena | character:ultron | 265/1275                    
character:thena | character:red skull | 266/1275                    
character:thena | character:captain marvel | 267/1275                    
character:thena | character:captain carter | 268/1275                    
character:thena | character:hulk | 269/1275                    
character:thena | character:nebula | 270/1275                    
character:thena | character:hank pym | 271/1275                    
character:thena | character:ant-man | 272/1275                    
character:thena | character:thanos | 273/1275                    
character:thena | character:kate bishop | 274/1275                    
character:thena | character:kingpin | 275/1275                    
character:thena | character:moon knight | 276/1275                    
character:thena | character:arthur harrow | 277/1275                    
character:thena | character:kamala khan | 278/1275                    
character:thena | character:she-hulk | 279/1275                    
character:thena | character:abomination | 280/1275                    
character:thena | character:odin | 281/1275                    
character:thena | character:pepper potts | 282/1275                    
character:thena | character:mj | 283/1275                    
character:thena | character:ned | 284/1275                    
character:thena | character:happy | 285/1275                    
character:ajak | character:ajak | 286/1275                    
character:ajak | character:spider-man | 287/1275                    
character:ajak | character:doctor strange | 288/1275                    
character:ajak | character:electro | 289/1275                    
character:ajak | character:green goblin | 290/1275                    
character:ajak | character:doc ock | 291/1275                    
character:ajak | character:wong | 292/1275                    
character:ajak | character:wanda | 293/1275                    
character:ajak | character:thor | 294/1275                    
character:ajak | character:jane foster | 295/1275                    
character:ajak | character:gorr | 296/1275                    
character:ajak | character:vision | 297/1275                    
character:ajak | character:agnes | 298/1275                    
character:ajak | character:falcon | 299/1275                    
character:ajak | character:bucky | 300/1275                    
character:ajak | character:john walker | 301/1275                    
character:ajak | character:captain america | 302/1275                    
character:ajak | character:loki | 303/1275                    
character:ajak | character:casey | 304/1275                    
character:ajak | character:the watcher | 305/1275                    
character:ajak | character:nick fury | 306/1275                    
character:ajak | character:iron man | 307/1275                    
character:ajak | character:hawkeye | 308/1275                    
character:ajak | character:ultron | 309/1275                    
character:ajak | character:red skull | 310/1275                    
character:ajak | character:captain marvel | 311/1275                    
character:ajak | character:captain carter | 312/1275                    
character:ajak | character:hulk | 313/1275                    
character:ajak | character:nebula | 314/1275                    
character:ajak | character:hank pym | 315/1275                    
character:ajak | character:ant-man | 316/1275                    
character:ajak | character:thanos | 317/1275                    
character:ajak | character:kate bishop | 318/1275                    
character:ajak | character:kingpin | 319/1275                    
character:ajak | character:moon knight | 320/1275                    
character:ajak | character:arthur harrow | 321/1275                    
character:ajak | character:kamala khan | 322/1275                    
character:ajak | character:she-hulk | 323/1275                    
character:ajak | character:abomination | 324/1275                    
character:ajak | character:odin | 325/1275                    
character:ajak | character:pepper potts | 326/1275                    
character:ajak | character:mj | 327/1275                    
character:ajak | character:ned | 328/1275                    
character:ajak | character:happy | 329/1275                    
character:spider-man | character:spider-man | 330/1275                    
character:spider-man | character:doctor strange | 331/1275                    
character:spider-man | character:electro | 332/1275                    
character:spider-man | character:green goblin | 333/1275                    
character:spider-man | character:doc ock | 334/1275                    
character:spider-man | character:wong | 335/1275                    
character:spider-man | character:wanda | 336/1275                    
character:spider-man | character:thor | 337/1275                    
character:spider-man | character:jane foster | 338/1275                    
character:spider-man | character:gorr | 339/1275                    
character:spider-man | character:vision | 340/1275                    
character:spider-man | character:agnes | 341/1275                    
character:spider-man | character:falcon | 342/1275                    
character:spider-man | character:bucky | 343/1275                    
character:spider-man | character:john walker | 344/1275                    
character:spider-man | character:captain america | 345/1275                    
character:spider-man | character:loki | 346/1275                    
character:spider-man | character:casey | 347/1275                    
character:spider-man | character:the watcher | 348/1275                    
character:spider-man | character:nick fury | 349/1275                    
character:spider-man | character:iron man | 350/1275                    
character:spider-man | character:hawkeye | 351/1275                    
character:spider-man | character:ultron | 352/1275                    
character:spider-man | character:red skull | 353/1275                    
character:spider-man | character:captain marvel | 354/1275                    
character:spider-man | character:captain carter | 355/1275                    
character:spi

*** WARNING: max output size exceeded, skipping output. ***

eye | 929/1275                    
character:casey | character:ultron | 930/1275                    
character:casey | character:red skull | 931/1275                    
character:casey | character:captain marvel | 932/1275                    
character:casey | character:captain carter | 933/1275                    
character:casey | character:hulk | 934/1275                    
character:casey | character:nebula | 935/1275                    
character:casey | character:hank pym | 936/1275                    
character:casey | character:ant-man | 937/1275                    
character:casey | character:thanos | 938/1275                    
character:casey | character:kate bishop | 939/1275                    
character:casey | character:kingpin | 940/1275                    
character:casey | character:moon knight | 941/1275                    
character:casey | character:arthur harrow | 942/1275                    
character:casey | character:kamala khan | 943/1275                    
character:casey | character:she-hulk | 944/1275                    
character:casey | character:abomination | 945/1275                    
character:casey | character:odin | 946/1275                    
character:casey | character:pepper potts | 947/1275                    
character:casey | character:mj | 948/1275                    
character:casey | character:ned | 949/1275                    
character:casey | character:happy | 950/1275                    
character:the watcher | character:the watcher | 951/1275                    
character:the watcher | character:nick fury | 952/1275                    
character:the watcher | character:iron man | 953/1275                    
character:the watcher | character:hawkeye | 954/1275                    
character:the watcher | character:ultron | 955/1275                    
character:the watcher | character:red skull | 956/1275                    
character:the watcher | character:captain marvel | 957/1275                    
character:the watcher | character:captain carter | 958/1275                    
character:the watcher | character:hulk | 959/1275                    
character:the watcher | character:nebula | 960/1275                    
character:the watcher | character:hank pym | 961/1275                    
character:the watcher | character:ant-man | 962/1275                    
character:the watcher | character:thanos | 963/1275                    
character:the watcher | character:kate bishop | 964/1275                    
character:the watcher | character:kingpin | 965/1275                    
character:the watcher | character:moon knight | 966/1275                    
character:the watcher | character:arthur harrow | 967/1275                    
character:the watcher | character:kamala khan | 968/1275                    
character:the watcher | character:she-hulk | 969/1275                    
character:the watcher | character:abomination | 970/1275                    
character:the watcher | character:odin | 971/1275                    
character:the watcher | character:pepper potts | 972/1275                    
character:the watcher | character:mj | 973/1275                    
character:the watcher | character:ned | 974/1275                    
character:the watcher | character:happy | 975/1275                    
character:nick fury | character:nick fury | 976/1275                    
character:nick fury | character:iron man | 977/1275                    
character:nick fury | character:hawkeye | 978/1275                    
character:nick fury | character:ultron | 979/1275                    
character:nick fury | character:red skull | 980/1275                    
character:nick fury | character:captain marvel | 981/1275                    
character:nick fury | character:captain carter | 982/1275                    
character:nick fury | character:hulk | 983/1275                    
character:nick fury | character:nebula | 984/1275                    
character:nick fury | character:hank pym | 985/1275                    
character:nick fury | character:ant-man | 986/1275                    
character:nick fury | character:thanos | 987/1275                    
character:nick fury | character:kate bishop | 988/1275                    
character:nick fury | character:kingpin | 989/1275                    
character:nick fury | character:moon knight | 990/1275                    
character:nick fury | character:arthur harrow | 991/1275                    
character:nick fury | character:kamala khan | 992/1275                    
character:nick fury | character:she-hulk | 993/1275                    
character:nick fury | character:abomination | 994/1275                    
character:nick fury | character:odin | 995/1275                    
character:nick fury | character:pepper potts | 996/1275                    
character:nick fury | character:mj | 997/1275                    
character:nick fury | character:ned | 998/1275                    
character:nick fury | character:happy | 999/1275                    
character:iron man | character:iron man | 1000/1275                    
character:iron man | character:hawkeye | 1001/1275                    
character:iron man | character:ultron | 1002/1275                    
character:iron man | character:red skull | 1003/1275                    
character:iron man | character:captain marvel | 1004/1275                    
character:iron man | character:captain carter | 1005/1275                    
character:iron man | character:hulk | 1006/1275                    
character:iron man | character:nebula | 1007/1275                    
character:iron man | character:hank pym | 1008/1275                    
character:iron man | character:ant-man | 1009/1275                    
character:iron man | character:thanos | 1010/1275                    
character:iron man | character:kate bishop | 1011/1275                    
character:iron man | character:kingpin | 1012/1275                    
character:iron man | character:moon knight | 1013/1275                    
character:iron man | character:arthur harrow | 1014/1275                    
character:iron man | character:kamala khan | 1015/1275                    
character:iron man | character:she-hulk | 1016/1275                    
character:iron man | character:abomination | 1017/1275                    
character:iron man | character:odin | 1018/1275                    
character:iron man | character:pepper potts | 1019/1275                    
character:iron man | character:mj | 1020/1275                    
character:iron man | character:ned | 1021/1275                    
character:iron man | character:happy | 1022/1275                    
character:hawkeye | character:hawkeye | 1023/1275                    
character:hawkeye | character:ultron | 1024/1275                    
character:hawkeye | character:red skull | 1025/1275                    
character:hawkeye | character:captain marvel | 1026/1275                    
character:hawkeye | character:captain carter | 1027/1275                    
character:hawkeye | character:hulk | 1028/1275                    
character:hawkeye | character:nebula | 1029/1275                    
character:hawkeye | character:hank pym | 1030/1275                    
character:hawkeye | character:ant-man | 1031/1275                    
character:hawkeye | character:thanos | 1032/1275                    
character:hawkeye | character:kate bishop | 1033/1275                    
character:hawkeye | character:kingpin | 1034/1275                    
character:hawkeye | character:moon knight | 1035/1275                    
character:hawkeye | character:arthur harrow | 1036/1275                    
character:hawkeye | character:kamala khan | 1037/1275                    
character:hawkeye | character:she-hulk | 1038/1275                    
character:hawkeye | character:abomination | 1039/1275                    
character:hawkeye | character:odin | 1040/1275                    
character:hawkeye | character:pepper potts | 1041/1275                    
character:hawkeye | character:mj | 1042/1275                    
character:hawkeye | character:ned | 1043/1275                    
character:hawkeye | character:happy | 1044/1275                    
character:ultron | character:ultron | 1045/1275                    
character:ultron | character:red skull | 1046/1275                    
character:ultron | character:captain marvel | 1047/1275                    
character:ultron | character:captain carter | 1048/1275                    
character:ultron | character:hulk | 1049/1275                    
character:ultron | character:nebula | 1050/1275                    
character:ultron | character:hank pym | 1051/1275                    
character:ultron | character:ant-man | 1052/1275                    
character:ultron | character:thanos | 1053/1275                    
character:ultron | character:kate bishop | 1054/1275                    
character:ultron | character:kingpin | 1055/1275                    
character:ultron | character:moon knight | 1056/1275                    
character:ultron | character:arthur harrow | 1057/1275                    
character:ultron | character:kamala khan | 1058/1275                    
character:ultron | character:she-hulk | 1059/1275                    
character:ultron | character:abomination | 1060/1275                    
character:ultron | character:odin | 1061/1275                    
character:ultron | character:pepper potts | 1062/1275                    
character:ultron | character:mj | 1063/1275                    
character:ultron | character:ned | 1064/1275                    
character:ultron | character:happy | 1065/1275                    
character:red skull | character:red skull | 1066/1275                    
character:red skull | character:captain marvel | 1067/1275                    
character:red skull | character:captain carter | 1068/1275                    
character:red skull | character:hulk | 1069/1275                    
character:red skull | character:nebula | 1070/1275                    
character:red skull | character:hank pym | 1071/1275                    
character:red skull | character:ant-man | 1072/1275                    
character:red skull | character:thanos | 1073/1275                    
character:red skull | character:kate bishop | 1074/1275                    
character:red skull | character:kingpin | 1075/1275                    
character:red skull | character:moon knight | 1076/1275                    
character:red skull | character:arthur harrow | 1077/1275                    
character:red skull | character:kamala khan | 1078/1275                    
character:red skull | character:she-hulk | 1079/1275                    
character:red skull | character:abomination | 1080/1275                    
character:red skull | character:odin | 1081/1275                    
character:red skull | character:pepper potts | 1082/1275                    
character:red skull | character:mj | 1083/1275                    
character:red skull | character:ned | 1084/1275                    
character:red skull | character:happy | 1085/1275                    
character:captain marvel | character:captain marvel | 1086/1275                    
character:captain marvel | character:captain carter | 1087/1275                    
character:captain marvel | character:hulk | 1088/1275                    
character:captain marvel | character:nebula | 1089/1275                    
character:captain marvel | character:hank pym | 1090/1275                    
character:captain marvel | character:ant-man | 1091/1275                    
character:captain marvel | character:thanos | 1092/1275                    
character:captain marvel | character:kate bishop | 1093/1275                    
character:captain marvel | character:kingpin | 1094/1275                    
character:captain marvel | character:moon knight | 1095/1275                    
character:captain marvel | character:arthur harrow | 1096/1275                    
character:captain marvel | character:kamala khan | 1097/1275                    
character:captain marvel | character:she-hulk | 1098/1275                    
character:captain marvel | character:abomination | 1099/1275                    
character:captain marvel | character:odin | 1100/1275                    
character:captain marvel | character:pepper potts | 1101/1275                    
character:captain marvel | character:mj | 1102/1275                    
character:captain marvel | character:ned | 1103/1275                    
character:captain marvel | character:happy | 1104/1275                    
character:captain carter | character:captain carter | 1105/1275                    
character:captain carter | character:hulk | 1106/1275                    
character:captain carter | character:nebula | 1107/1275                    
character:captain carter | character:hank pym | 1108/1275                    
character:captain carter | character:ant-man | 1109/1275                    
character:captain carter | character:thanos | 1110/1275                    
character:captain carter | character:kate bishop | 1111/1275                    
character:captain carter | character:kingpin | 1112/1275                    
character:captain carter | character:moon knight | 1113/1275                    
character:captain carter | character:arthur harrow | 1114/1275                    
character:captain carter | character:kamala khan | 1115/1275                    
character:captain carter | character:she-hulk | 1116/1275                    
character:captain carter | character:abomination | 1117/1275                    
character:captain carter | character:odin | 1118/1275                    
character:captain carter | character:pepper potts | 1119/1275                    
character:captain carter | character:mj | 1120/1275                    
character:captain carter | character:ned | 1121/1275                    
character:captain carter | character:happy | 1122/1275                    
character:hulk | character:hulk | 1123/1275                    
character:hulk | character:nebula | 1124/1275                    
character:hulk | character:hank pym | 1125/1275                    
character:hulk | character:ant-man | 1126/1275                    
character:hulk | character:thanos | 1127/1275                    
character:hulk | character:kate bishop | 1128/1275                    
character:hulk | character:kingpin | 1129/1275                    
character:hulk | character:moon knight | 1130/1275                    
character:hulk | character:arthur harrow | 1131/1275                    
character:hulk | character:kamala khan | 1132/1275                    
character:hulk | character:she-hulk | 1133/1275                    
character:hulk | character:abomination | 1134/1275                    
character:hulk | character:odin | 1135/1275                    
character:hulk | character:pepper potts | 1136/1275                    
character:hulk | character:mj | 1137/1275                    
character:hulk | character:ned | 1138/1275                    
character:hulk | character:happy | 1139/1275                    
character:nebula | character:nebula | 1140/1275                    
character:nebula | character:hank pym | 1141/1275                    
character:nebula | character:ant-man | 1142/1275                    
character:nebula | character:thanos | 1143/1275                    
character:nebula | character:kate bishop | 1144/1275                    
character:nebula | character:kingpin | 1145/1275                    
character:nebula | character:moon knight | 1146/1275                    
character:nebula | character:arthur harrow | 1147/1275                    
character:nebula | character:kamala khan | 1148/1275                    
character:nebula | character:she-hulk | 1149/1275                    
character:nebula | character:abomination | 1150/1275                    
character:nebula | character:odin | 1151/1275                    
character:nebula | character:pepper potts | 1152/1275                    
character:nebula | character:mj | 1153/1275                    
character:nebula | character:ned | 1154/1275                    
character:nebula | character:happy | 1155/1275                    
character:hank pym | character:hank pym | 1156/1275                    
character:hank pym | character:ant-man | 1157/1275                    
character:hank pym | character:thanos | 1158/1275                    
character:hank pym | character:kate bishop | 1159/1275                    
character:hank pym | character:kingpin | 1160/1275                    
character:hank pym | character:moon knight | 1161/1275                    
character:hank pym | character:arthur harrow | 1162/1275                    
character:hank pym | character:kamala khan | 1163/1275                    
character:hank pym | character:she-hulk | 1164/1275                    
character:hank pym | character:abomination | 1165/1275                    
character:hank pym | character:odin | 1166/1275                    
character:hank pym | character:pepper potts | 1167/1275                    
character:hank pym | character:mj | 1168/1275                    
character:hank pym | character:ned | 1169/1275                    
character:hank pym | character:happy | 1170/1275                    
character:ant-man | character:ant-man | 1171/1275                    
character:ant-man | character:thanos | 1172/1275                    
character:ant-man | character:kate bishop | 1173/1275                    
character:ant-man | character:kingpin | 1174/1275                    
character:ant-man | character:moon knight | 1175/1275                    
character:ant-man | character:arthur harrow | 1176/1275                    
character:ant-man | character:kamala khan | 1177/1275                    
character:ant-man | character:she-hulk | 1178/1275                    
character:ant-man | character:abomination | 1179/1275                    
character:ant-man | character:odin | 1180/1275                    
character:ant-man | character:pepper potts | 1181/1275                    
character:ant-man | character:mj | 1182/1275                    
character:ant-man | character:ned | 1183/1275                    
character:ant-man | character:happy | 1184/1275                    
character:thanos | character:thanos | 1185/1275                    
character:thanos | character:kate bishop | 1186/1275                    
character:thanos | character:kingpin | 1187/1275                    
character:thanos | character:moon knight | 1188/1275                    
character:thanos | character:arthur harrow | 1189/1275                    
character:thanos | character:kamala khan | 1190/1275                    
character:thanos | character:she-hulk | 1191/1275                    
character:thanos | character:abomination | 1192/1275                    
character:thanos | character:odin | 1193/1275                    
character:thanos | character:pepper potts | 1194/1275                    
character:thanos | character:mj | 1195/1275                    
character:thanos | character:ned | 1196/1275                    
character:thanos | character:happy | 1197/1275                    
character:kate bishop | character:kate bishop | 1198/1275                    
character:kate bishop | character:kingpin | 1199/1275                    
character:kate bishop | character:moon knight | 1200/1275                    
character:kate bishop | character:arthur harrow | 1201/1275                    
character:kate bishop | character:kamala khan | 1202/1275                    
character:kate bishop | character:she-hulk | 1203/1275                    
character:kate bishop | character:abomination | 1204/1275                    
character:kate bishop | character:odin | 1205/1275                    
character:kate bishop | character:pepper potts | 1206/1275                    
character:kate bishop | character:mj | 1207/1275                    
character:kate bishop | character:ned | 1208/1275                    
character:kate bishop | character:happy | 1209/1275                    
character:kingpin | character:kingpin | 1210/1275                    
character:kingpin | character:moon knight | 1211/1275                    
character:kingpin | character:arthur harrow | 1212/1275                    
character:kingpin | character:kamala khan | 1213/1275                    
character:kingpin | character:she-hulk | 1214/1275                    
character:kingpin | character:abomination | 1215/1275                    
character:kingpin | character:odin | 1216/1275                    
character:kingpin | character:pepper potts | 1217/1275                    
character:kingpin | character:mj | 1218/1275                    
character:kingpin | character:ned | 1219/1275                    
character:kingpin | character:happy | 1220/1275                    
character:moon knight | character:moon knight | 1221/1275                    
character:moon knight | character:arthur harrow | 1222/1275                    
character:moon knight | character:kamala khan | 1223/1275                    
character:moon knight | character:she-hulk | 1224/1275                    
character:moon knight | character:abomination | 1225/1275                    
character:moon knight | character:odin | 1226/1275                    
character:moon knight | character:pepper potts | 1227/1275                    
character:moon knight | character:mj | 1228/1275                    
character:moon knight | character:ned | 1229/1275                    
character:moon knight | character:happy | 1230/1275                    
character:arthur harrow | character:arthur harrow | 1231/1275                    
character:arthur harrow | character:kamala khan | 1232/1275                    
character:arthur harrow | character:she-hulk | 1233/1275                    
character:arthur harrow | character:abomination | 1234/1275                    
character:arthur harrow | character:odin | 1235/1275                    
character:arthur harrow | character:pepper potts | 1236/1275                    
character:arthur harrow | character:mj | 1237/1275                    
character:arthur harrow | character:ned | 1238/1275                    
character:arthur harrow | character:happy | 1239/1275                    
character:kamala khan | character:kamala khan | 1240/1275                    
character:kamala khan | character:she-hulk | 1241/1275                    
character:kamala khan | character:abomination | 1242/1275                    
character:kamala khan | character:odin | 1243/1275                    
character:kamala khan | character:pepper potts | 1244/1275                    
character:kamala khan | character:mj | 1245/1275                    
character:kamala khan | character:ned | 1246/1275                    
character:kamala khan | character:happy | 1247/1275                    
character:she-hulk | character:she-hulk | 1248/1275                    
character:she-hulk | character:abomination | 1249/1275                    
character:she-hulk | character:odin | 1250/1275                    
character:she-hulk | character:pepper potts | 1251/1275                    
character:she-hulk | character:mj | 1252/1275                    
character:she-hulk | character:ned | 1253/1275                    
character:she-hulk | character:happy | 1254/1275                    
character:abomination | character:abomination | 1255/1275                    
character:abomination | character:odin | 1256/1275                    
character:abomination | character:pepper potts | 1257/1275                    
character:abomination | character:mj | 1258/1275                    
character:abomination | character:ned | 1259/1275                    
character:abomination | character:happy | 1260/1275                    
character:odin | character:odin | 1261/1275                    
character:odin | character:pepper potts | 1262/1275                    
character:odin | character:mj | 1263/1275                    
character:odin | character:ned | 1264/1275                    
character:odin | character:happy | 1265/1275                    
character:pepper potts | character:pepper potts | 1266/1275                    
character:pepper potts | character:mj | 1267/1275                    
character:pepper potts | character:ned | 1268/1275                    
character:pepper potts | character:happy | 1269/1275                    
character:mj | character:mj | 1270/1275                    
character:mj | character:ned | 1271/1275                    
character:mj | character:happy | 1272/1275                    
character:ned | character:ned | 1273/1275                    
character:ned | character:happy | 1274/1275                    
character:happy | character:happy | 1275/1275                    
In [0]:
# percentage of positive comments
plt.figure(figsize=(8, 8))
plt.imshow(char_cooccur_positve_perc)
plt.grid(False)
In [0]:
# character co-occurence count
# char_cooccur_count_normalized = np.zeros(char_cooccur_count.shape)
# for i in range(num_character):
#   for j in range(i, num_character):
#     if i == j:
#       char_cooccur_count_normalized[i, j] = 1
#       continue
#     char_cooccur_count_normalized[i, j] = char_cooccur_count_normalized[j, i] = char_cooccur_count[i, j] / char_cooccur_count[i, i]
char_cooccur_count_normalized = char_cooccur_count / char_cooccur_count.diagonal()[:,None]

plt.figure(figsize=(8, 8))
plt.imshow(np.tril(char_cooccur_count, -1))
plt.grid(False)
In [0]:
# save to local
csv_root_dir = '/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/'
np.savetxt(csv_root_dir + 'char_cooccur_count.csv', char_cooccur_count, delimiter=',')
np.savetxt(csv_root_dir + 'char_cooccur_positve_perc.csv', char_cooccur_positve_perc, delimiter=',')
np.savetxt(csv_root_dir + 'char_cooccur_score.csv', char_cooccur_score, delimiter=',')
In [0]:
# load from local
csv_root_dir = '/Workspace/Repos/Shared/fall-2022-reddit-big-data-project-project-group-1/data/csv/'
char_cooccur_count = np.loadtxt(csv_root_dir + 'char_cooccur_count.csv', delimiter=',')
char_cooccur_positve_perc = np.loadtxt(csv_root_dir + 'char_cooccur_positve_perc.csv', delimiter=',')
char_cooccur_score = np.loadtxt(csv_root_dir + 'char_cooccur_score.csv', delimiter=',')
In [0]:
# function that capitialize the character names
def capitalize(char_name):
  if ' ' in char_name:
    result = ' '.join([x.capitalize() for x in char_name.split(' ')])
  elif '-' in char_name:
    result = '-'.join([x.capitalize() for x in char_name.split('-')])
  elif len(char_name) == 2:
    result = char_name.upper()
  else:
    result = char_name.capitalize()
  return result
In [0]:
# write result to txt
result_txt = ''
for i in range(num_character):
  for j in range(num_character):
    tmp = char_cooccur_positve_perc[i, j]
    result_txt += f'[{num_character-i-1}, {j}, {"{:.3f}".format(tmp)}],\n'

with open('result.txt', 'w') as fID:
  fID.write(result_txt)
In [0]:
# generate network (tried but failed)
from pyvis.network import Network

# max_coocur_count = np.max(char_cooccur_count[np.triu_indices(num_character)])

net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", neighborhood_highlight=True, filter_menu=True)
characters = [c.split(':')[-1] for c in character_columns]

net.add_nodes(range(num_character), value=char_cooccur_count.diagonal(), label=[capitalize(c) for c in characters])

for i in range(num_character):
  for j in range(i, num_character):
    if i == j:
      continue
    net.add_edge(i, j, hidden=False, physics=False, value=char_cooccur_count[i, j])