from pyspark.sql.functions import *


# This line was used to make sure that we can access the parquet files
# dbutils.fs.ls("abfss://anly502@marckvaismanblob.dfs.core.windows.net/reddit/parquet/comments")


# Read all submissions and comments
submissions = spark.read.parquet("abfss://anly502@marckvaismanblob.dfs.core.windows.net/reddit/parquet/submissions")
comments = spark.read.parquet("abfss://anly502@marckvaismanblob.dfs.core.windows.net/reddit/parquet/comments")


# Retrieve submissions and coments of subredit channel r/marvelstudio 
marvel_sub = submissions.filter(col("subreddit")=="marvelstudios")
marvel_com = comments.filter(col("subreddit")=="marvelstudios")


# Report the basic info about submission dataset
# Check submissions dataframe shape
print('Marvel studio submissions dataframe shape:', (marvel_sub.count(), len(marvel_sub.columns)))
# Report dataframe schema
print('=' * 30)
marvel_sub.printSchema()

Marvel studio submissions dataframe shape: (185010, 68)
==============================
root
 |-- adserver_click_url: string (nullable = true)
 |-- adserver_imp_pixel: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- brand_safe: boolean (nullable = true)
 |-- contest_mode: boolean (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- crosspost_parent: string (nullable = true)
 |-- crosspost_parent_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- approved_at_utc: string (nullable = true)
 |    |    |-- approved_by: string (nullable = true)
 |    |    |-- archived: boolean (nullable = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- author_flair_css_class: string (nullable = true)
 |    |    |-- author_flair_text: string (nullable = true)
 |    |    |-- banned_at_utc: string (nullable = true)
 |    |    |-- banned_by: string (nullable = true)
 |    |    |-- brand_safe: boolean (nullable = true)
 |    |    |-- can_gild: boolean (nullable = true)
 |    |    |-- can_mod_post: boolean (nullable = true)
 |    |    |-- clicked: boolean (nullable = true)
 |    |    |-- contest_mode: boolean (nullable = true)
 |    |    |-- created: double (nullable = true)
 |    |    |-- created_utc: double (nullable = true)
 |    |    |-- distinguished: string (nullable = true)
 |    |    |-- domain: string (nullable = true)
 |    |    |-- downs: long (nullable = true)
 |    |    |-- edited: boolean (nullable = true)
 |    |    |-- gilded: long (nullable = true)
 |    |    |-- hidden: boolean (nullable = true)
 |    |    |-- hide_score: boolean (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- is_crosspostable: boolean (nullable = true)
 |    |    |-- is_reddit_media_domain: boolean (nullable = true)
 |    |    |-- is_self: boolean (nullable = true)
 |    |    |-- is_video: boolean (nullable = true)
 |    |    |-- likes: string (nullable = true)
 |    |    |-- link_flair_css_class: string (nullable = true)
 |    |    |-- link_flair_text: string (nullable = true)
 |    |    |-- locked: boolean (nullable = true)
 |    |    |-- media: string (nullable = true)
 |    |    |-- mod_reports: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- num_comments: long (nullable = true)
 |    |    |-- num_crossposts: long (nullable = true)
 |    |    |-- num_reports: string (nullable = true)
 |    |    |-- over_18: boolean (nullable = true)
 |    |    |-- parent_whitelist_status: string (nullable = true)
 |    |    |-- permalink: string (nullable = true)
 |    |    |-- pinned: boolean (nullable = true)
 |    |    |-- quarantine: boolean (nullable = true)
 |    |    |-- removal_reason: string (nullable = true)
 |    |    |-- report_reasons: string (nullable = true)
 |    |    |-- saved: boolean (nullable = true)
 |    |    |-- score: long (nullable = true)
 |    |    |-- secure_media: string (nullable = true)
 |    |    |-- selftext: string (nullable = true)
 |    |    |-- selftext_html: string (nullable = true)
 |    |    |-- spoiler: boolean (nullable = true)
 |    |    |-- stickied: boolean (nullable = true)
 |    |    |-- subreddit: string (nullable = true)
 |    |    |-- subreddit_id: string (nullable = true)
 |    |    |-- subreddit_name_prefixed: string (nullable = true)
 |    |    |-- subreddit_type: string (nullable = true)
 |    |    |-- suggested_sort: string (nullable = true)
 |    |    |-- thumbnail: string (nullable = true)
 |    |    |-- thumbnail_height: string (nullable = true)
 |    |    |-- thumbnail_width: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- ups: long (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- user_reports: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- view_count: string (nullable = true)
 |    |    |-- visited: boolean (nullable = true)
 |    |    |-- whitelist_status: string (nullable = true)
 |-- disable_comments: boolean (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- domain_override: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- embed_type: string (nullable = true)
 |-- embed_url: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- hidden: boolean (nullable = true)
 |-- hide_score: boolean (nullable = true)
 |-- href_url: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imp_pixel: string (nullable = true)
 |-- is_crosspostable: boolean (nullable = true)
 |-- is_reddit_media_domain: boolean (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- link_flair_css_class: string (nullable = true)
 |-- link_flair_text: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- media: struct (nullable = true)
 |    |-- event_id: string (nullable = true)
 |    |-- oembed: struct (nullable = true)
 |    |    |-- author_name: string (nullable = true)
 |    |    |-- author_url: string (nullable = true)
 |    |    |-- cache_age: long (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- height: long (nullable = true)
 |    |    |-- html: string (nullable = true)
 |    |    |-- provider_name: string (nullable = true)
 |    |    |-- provider_url: string (nullable = true)
 |    |    |-- thumbnail_height: long (nullable = true)
 |    |    |-- thumbnail_url: string (nullable = true)
 |    |    |-- thumbnail_width: long (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- version: string (nullable = true)
 |    |    |-- width: long (nullable = true)
 |    |-- reddit_video: struct (nullable = true)
 |    |    |-- dash_url: string (nullable = true)
 |    |    |-- duration: long (nullable = true)
 |    |    |-- fallback_url: string (nullable = true)
 |    |    |-- height: long (nullable = true)
 |    |    |-- hls_url: string (nullable = true)
 |    |    |-- is_gif: boolean (nullable = true)
 |    |    |-- scrubber_media_url: string (nullable = true)
 |    |    |-- transcoding_status: string (nullable = true)
 |    |    |-- width: long (nullable = true)
 |    |-- type: string (nullable = true)
 |-- media_embed: struct (nullable = true)
 |    |-- content: string (nullable = true)
 |    |-- height: long (nullable = true)
 |    |-- scrolling: boolean (nullable = true)
 |    |-- width: long (nullable = true)
 |-- mobile_ad_url: string (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- num_crossposts: long (nullable = true)
 |-- original_link: string (nullable = true)
 |-- over_18: boolean (nullable = true)
 |-- parent_whitelist_status: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- pinned: boolean (nullable = true)
 |-- post_hint: string (nullable = true)
 |-- preview: struct (nullable = true)
 |    |-- enabled: boolean (nullable = true)
 |    |-- images: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- resolutions: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |-- source: struct (nullable = true)
 |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |-- variants: struct (nullable = true)
 |    |    |    |    |-- gif: struct (nullable = true)
 |    |    |    |    |    |-- resolutions: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |    |    |-- source: struct (nullable = true)
 |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |    |-- mp4: struct (nullable = true)
 |    |    |    |    |    |-- resolutions: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |    |    |-- source: struct (nullable = true)
 |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |    |-- nsfw: struct (nullable = true)
 |    |    |    |    |    |-- resolutions: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |    |    |-- source: struct (nullable = true)
 |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |    |-- obfuscated: struct (nullable = true)
 |    |    |    |    |    |-- resolutions: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |    |-- width: long (nullable = true)
 |    |    |    |    |    |-- source: struct (nullable = true)
 |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |-- url: string (nullable = true)
 |    |    |    |    |    |    |-- width: long (nullable = true)
 |-- promoted: boolean (nullable = true)
 |-- promoted_by: string (nullable = true)
 |-- promoted_display_name: string (nullable = true)
 |-- promoted_url: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- secure_media: struct (nullable = true)
 |    |-- event_id: string (nullable = true)
 |    |-- oembed: struct (nullable = true)
 |    |    |-- author_name: string (nullable = true)
 |    |    |-- author_url: string (nullable = true)
 |    |    |-- cache_age: long (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- height: long (nullable = true)
 |    |    |-- html: string (nullable = true)
 |    |    |-- provider_name: string (nullable = true)
 |    |    |-- provider_url: string (nullable = true)
 |    |    |-- thumbnail_height: long (nullable = true)
 |    |    |-- thumbnail_url: string (nullable = true)
 |    |    |-- thumbnail_width: long (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- version: string (nullable = true)
 |    |    |-- width: long (nullable = true)
 |    |-- type: string (nullable = true)
 |-- secure_media_embed: struct (nullable = true)
 |    |-- content: string (nullable = true)
 |    |-- height: long (nullable = true)
 |    |-- media_domain_url: string (nullable = true)
 |    |-- scrolling: boolean (nullable = true)
 |    |-- width: long (nullable = true)
 |-- selftext: string (nullable = true)
 |-- spoiler: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- suggested_sort: string (nullable = true)
 |-- third_party_trackers: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- third_party_tracking: string (nullable = true)
 |-- third_party_tracking_2: string (nullable = true)
 |-- thumbnail: string (nullable = true)
 |-- thumbnail_height: long (nullable = true)
 |-- thumbnail_width: long (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- whitelist_status: string (nullable = true)


# Report the basic info about submission dataset
# Check comments dataframe shape
print('Marvel studio comments dataframe shape:', (marvel_com.count(), len(marvel_com.columns)))
# Report dataframe schema
print('=' * 30)
marvel_com.printSchema()

Marvel studio comments dataframe shape: (5339267, 21)
==============================
root
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)


# Check timestamps of when the first and last submissions were created
print('First submission created_utc (UNIX):', marvel_sub.select("created_utc").sort("created_utc").head(1))
print('Last  submission created_utc (UNIX):', marvel_sub.select("created_utc").sort("created_utc").tail(1))

First submission created_utc (UNIX): [Row(created_utc=1609459575)]
Last  submission created_utc (UNIX): [Row(created_utc=1661989603)]


# Convert UNIX timestamp into human-readable date to check the time period
import datetime

print('First submission creation datetime:', datetime.datetime.fromtimestamp(1609459575))
print('Last  submission creation datetime:', datetime.datetime.fromtimestamp(1661989603))

First submission creation datetime: 2021-01-01 00:06:15
Last  submission creation datetime: 2022-08-31 23:46:43


# Extract the columns related to our business goals
keep_var_list_sub = ["id", "author", "created_utc", "title", "selftext", "score"]
keep_var_list_com = ["link_id", "author", "created_utc", "body", "score"]
data_sub = marvel_sub.select(keep_var_list_sub)
data_com = marvel_com.select(keep_var_list_com)


# Get the Statistics for submissions selected columns
data_sub.describe().show()
# Get the Statistics for comments selected columns
data_com.describe().show()

+-------+------+--------------------+--------------------+--------------------+--------+------------------+
|summary|    id|              author|         created_utc|               title|selftext|             score|
+-------+------+--------------------+--------------------+--------------------+--------+------------------+
|  count|185010|              185010|              185010|              185010|  185010|            185010|
|   mean|  null|1.362439414093085...| 1.635125253135771E9|        1.02621215E9|    null|287.00579968650345|
| stddev|  null|7.600777962613307...|1.5134434001603175E7|2.2829286854864154E9|    null|2238.2628136980743|
|    min|kj2pt6|         ----Dongers|          1609459575|                   !|        |                 0|
|    max|x2seos|          zzzzzlllll|          1661989603|                  🫥|      🥲|            108284|
+-------+------+--------------------+--------------------+--------------------+--------+------------------+

+-------+---------+-------------------+--------------------+-------+------------------+
|summary|  link_id|             author|         created_utc|   body|             score|
+-------+---------+-------------------+--------------------+-------+------------------+
|  count|  5339267|            5339267|             5339267|5339267|           5339267|
|   mean|     null|           Infinity| 1.637007490536831E9|    NaN|20.301143022066512|
| stddev|     null|                NaN|1.4839344100607885E7|    NaN| 173.8281117670103|
|    min|t3_10fxh5|---------V---------|          1609459246|       |              -940|
|    max| t3_xb7ba|          zzzzhzhzh|          1661990390|     🫶|             23891|
+-------+---------+-------------------+--------------------+-------+------------------+


# Check selected dataframe schema for submissions
print(f'{"submissions":=^30}')
data_sub.printSchema()
# Check selected dataframe schema for comments
print(f'{"comments":=^30}')
data_com.printSchema()

=========submissions==========
root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- score: long (nullable = true)

===========comments===========
root
 |-- link_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- body: string (nullable = true)
 |-- score: long (nullable = true)


# Find the columns that contain null values and count the number of null values in each column
data_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in data_sub.columns]).show()
data_com.select([count(when(col(c).isNull(), c)).alias(c) for c in data_com.columns]).show()

+---+------+-----------+-----+--------+-----+
| id|author|created_utc|title|selftext|score|
+---+------+-----------+-----+--------+-----+
|  0|     0|          0|    0|       0|    0|
+---+------+-----------+-----+--------+-----+

+-------+------+-----------+----+-----+
|link_id|author|created_utc|body|score|
+-------+------+-----------+----+-----+
|      0|     0|          0|   0|    0|
+-------+------+-----------+----+-----+


# Basic data cleaning for submission
# Concatenating title & selftext in submission dataframe into a new column named - "content"
# create a new column named is_submission to indicate that all rows in this dataframe are submissions
# also turn all texts to lowercase
df_sub = data_sub.select("id", "author", "created_utc", lower(concat(col("title"), lit(" "), col("selftext"))).alias("content"), "score", lit(True).alias("is_submission"))
df_sub.show(5)
# create a new column named is_submission to indicate that all rows in this dataframe are comments (not submission)
# also turn all texts to lowercase
df_com = data_com.withColumn("is_submission", lit(False)).withColumn("body", lower(col("body")))
df_com.show(5)

+------+-----------+-----------+--------------------+-----+-------------+
|    id|     author|created_utc|             content|score|is_submission|
+------+-----------+-----------+--------------------+-----+-------------+
|lqctf4|  [deleted]| 1614066815|i am inedible??? ...|    9|         true|
|lqcztv|kurtxie2020| 1614067484|i put wandavision...|  394|         true|
|lqd0ja|Barnesy91UK| 1614067556|what else has ? c...|    1|         true|
|lqd0y5|      cw1tt| 1614067598|if we are moving ...|    8|         true|
|lqd6h4|JevonJ_Haha| 1614068143|i cant wait until...|    1|         true|
+------+-----------+-----------+--------------------+-----+-------------+
only showing top 5 rows

+---------+--------------+-----------+--------------------+-----+-------------+
|  link_id|        author|created_utc|                body|score|is_submission|
+---------+--------------+-----------+--------------------+-----+-------------+
|t3_o652jl|   SIacktivist| 1624437159|agreed, but maybe...|   14|        false|
|t3_o679zl|  Milla4Prez66| 1624437163|it was. according...|    3|        false|
|t3_o64xd5|    BryanRazor| 1624437167|with the new epis...|    2|        false|
|t3_o679zl|GhostPandaBear| 1624437177|i’m so glad the t...|    1|        false|
|t3_o679zl|    MaGesticSC| 1624437180|a semantic argume...|    5|        false|
+---------+--------------+-----------+--------------------+-----+-------------+
only showing top 5 rows


# For submissions dataframe

from pyspark.sql.types import IntegerType

# Convert the UNIX time to human readable timestamps and change column data types
output_sub = df_sub.withColumn("created_utc", to_timestamp(from_unixtime("created_utc"))).withColumn("score", col("score").cast(IntegerType()))
# Rename "created_utc" to "created_ts"
output_sub = output_sub.withColumnRenamed("created_utc", "created_ts")
output_sub.show(5)

+------+-----------+-------------------+--------------------+-----+-------------+
|    id|     author|         created_ts|             content|score|is_submission|
+------+-----------+-------------------+--------------------+-----+-------------+
|lqctf4|  [deleted]|2021-02-23 07:53:35|i am inedible??? ...|    9|         true|
|lqcztv|kurtxie2020|2021-02-23 08:04:44|i put wandavision...|  394|         true|
|lqd0ja|Barnesy91UK|2021-02-23 08:05:56|what else has ? c...|    1|         true|
|lqd0y5|      cw1tt|2021-02-23 08:06:38|if we are moving ...|    8|         true|
|lqd6h4|JevonJ_Haha|2021-02-23 08:15:43|i cant wait until...|    1|         true|
+------+-----------+-------------------+--------------------+-----+-------------+
only showing top 5 rows


# For comments dataframe

# Convert the UNIX time to human readable timestamps and change column data types
output_com = df_com.withColumn('created_utc', to_timestamp(from_unixtime("created_utc"))).withColumn("score", col("score").cast(IntegerType()))
# Rename "link_id" to "id", "created_utc" to "created_ts", and "body" to "content"
output_com = output_com.withColumnRenamed("link_id", "id").withColumnRenamed("created_utc", "created_ts").withColumnRenamed("body", "content")
output_com.show(5)

+---------+--------------+-------------------+--------------------+-----+-------------+
|       id|        author|         created_ts|             content|score|is_submission|
+---------+--------------+-------------------+--------------------+-----+-------------+
|t3_o652jl|   SIacktivist|2021-06-23 08:32:39|agreed, but maybe...|   14|        false|
|t3_o679zl|  Milla4Prez66|2021-06-23 08:32:43|it was. according...|    3|        false|
|t3_o64xd5|    BryanRazor|2021-06-23 08:32:47|with the new epis...|    2|        false|
|t3_o679zl|GhostPandaBear|2021-06-23 08:32:57|i’m so glad the t...|    1|        false|
|t3_o679zl|    MaGesticSC|2021-06-23 08:33:00|a semantic argume...|    5|        false|
+---------+--------------+-------------------+--------------------+-----+-------------+
only showing top 5 rows


# Check the cleaned dataframes' schema
print(f'{"submissions":=^30}')
output_sub.printSchema()
print(f'{"comments":=^30}')
output_com.printSchema()

=========submissions==========
root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- content: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- is_submission: boolean (nullable = false)

===========comments===========
root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- content: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- is_submission: boolean (nullable = false)


# Vertical stack the submissions dataframe and the comments dataframe
sub_com = output_sub.union(output_com)
sub_com.printSchema()

root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- content: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- is_submission: boolean (nullable = false)


# Save processed intermediate data into parquet files
sub_com.write.parquet("/FileStore/sub_com/original_data/")


# Load processed intermediate data into parquet files
sub_com = spark.read.parquet("/FileStore/sub_com/original_data/")
sub_com_with_dummy = sub_com


# Simple cleaning of textual data <potential text cleaning step> 
# sub_com_with_dummy = sub_com_with_dummy.withColumn('content', translate('content', '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~', ' '))


# Create name to alias dict
CHARACTER_NAME_DICT = {
    'black widow' : ['black widow', 'natasha romanoff'],
    'yelena belova': ['yelena belova'],
    'shang-chi': ['shang-chi', 'shaun', 'shang chi'],
    'sersi': ['sersi'],
    'ikaris': ['ikaris'],
    'thena': ['thena'],
    'ajak': ['ajak'],
    'spider-man': ['spider-man', 'spider man', 'spiderman', 'peter parker'],
    'doctor strange': ['doctor strange', 'strange supreme', 'stephen strange'],
    'electro': ['max dillon', 'electro'],
    'green goblin': ['norman osborn', 'green goblin'],
    'doc ock': ['otto octavius', 'doc ock', 'doctor octopus'],
    'wong': ['wong'],
    'wanda': ['wanda', 'scarlet witch'],
    'thor': ['Thor', 'god of thunder'],
    'jane foster': ['jane foster', 'the mighty thor'],
    'gorr': ['gorr'],
    'vision': ['vision'],
    'agnes': ['agnes', 'agatha harkness'],
    'falcon': ['falcon', 'sam wilson'],
    'bucky': ['bucky', 'winter soldier'],
    'john walker': ['john walker'],
    'captain america' : ['captain america', 'steve rogers'],
    'loki': ['loki'],
    'casey': ['casey', 'hunter k-5e', 'hunter k 5e'],
    'the watcher': ['the watcher'],
    'nick fury': ['nick fury'],
    'iron man': ['iron man', 'tony stark'],
    'hawkeye': ['hawkeye', 'clint barton', 'ronin'],
    'ultron': ['ultron'],
    'red skull': ['red skull'],
    'captain marvel': ['captain marvel', 'carol danvers'],
    'captain carter': ['captain carter', 'peggy carter'],
    'hulk': ['bruce banner', 'hulk'],
    'nebula': ['nebula'],
    'hank pym': ['hank pym', 'yellow jacket'],
    'ant-man': ['ant-man', 'antman', 'ant man', 'scott lang'],
    'thanos': ['thanos'],
    'kate bishop': ['kate bishop'],
    'kingpin': ['kingpin'],
    'moon knight': ['moon knight', 'marc spector', 'steven grant', 'mr. knight', 'mr knight', 'jake lockley'],
    'arthur harrow': ['arthur harrow'],
    'kamala khan': ['kamala khan'],
    'she-hulk': ['jennifer walters', 'she-hulk', 'shehulk', 'she hulk'],
    'abomination': ['abomination', 'emil blonsky'],
    'odin': ['odin'],
    'pepper potts': ['pepper potts'],
    'mj': ['michelle', 'mj'],
    'ned': ['ned leeds', 'ned'],
    'happy': ['happy', 'happy hogan']
}

MOVIE_NAME_DICT = {
    'black widow' : ['black widow', 'blackwidow'],
    'shang-chi': ['shang-chi', 'shang chi'], 
    'eternals': ['eternals'],
    'spider-man': ['spider-man', 'spiderman', 'spider man'],
    'doctor strange': ['doctor strange'],
    'thor': ['thor']
}

SERIES_NAME_DICT = {
    'wandavision': ['wandavision'],
    'the falcon and the winter soldier': ['the falcon and the winter soldier'],
    'loki': ['loki'],
    'what if': ['what if'],
    'hawkeye': ['hawkeye'], 
    'moon knight': ['moon knight'],
    'ms marvel': ['ms marvel', 'ms. marvel'],
    'she-hulk': ['she-hulk', 'shehulk', 'she hulk']
}

# Add new colums to sub_com_with_dummy dataframe to show whether character / mdeia was mentioned in the content
for key in CHARACTER_NAME_DICT:
    values = CHARACTER_NAME_DICT[key]
    pattern = "|".join(['( '+name+')' for name in values])
    sub_com_with_dummy = sub_com_with_dummy.withColumn('character:'+key, col("content").rlike(pattern))
    
for key in MOVIE_NAME_DICT:
    values = MOVIE_NAME_DICT[key]
    pattern = "|".join(['( '+name+')' for name in values])
    sub_com_with_dummy = sub_com_with_dummy.withColumn('movie:'+key, col("content").rlike(pattern))

for key in SERIES_NAME_DICT:
    values = SERIES_NAME_DICT[key]
    pattern = "|".join(['( '+name+')' for name in values])
    sub_com_with_dummy = sub_com_with_dummy.withColumn('series:'+key, col("content").rlike(pattern))


sub_com_with_dummy.printSchema()

root
 |-- id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- created_ts: timestamp (nullable = true)
 |-- content: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- is_submission: boolean (nullable = false)
 |-- character:black widow: boolean (nullable = true)
 |-- character:yelena belova: boolean (nullable = true)
 |-- character:shang-chi: boolean (nullable = true)
 |-- character:sersi: boolean (nullable = true)
 |-- character:ikaris: boolean (nullable = true)
 |-- character:thena: boolean (nullable = true)
 |-- character:ajak: boolean (nullable = true)
 |-- character:spider-man: boolean (nullable = true)
 |-- character:doctor strange: boolean (nullable = true)
 |-- character:electro: boolean (nullable = true)
 |-- character:green goblin: boolean (nullable = true)
 |-- character:doc ock: boolean (nullable = true)
 |-- character:wong: boolean (nullable = true)
 |-- character:wanda: boolean (nullable = true)
 |-- character:thor: boolean (nullable = true)
 |-- character:jane foster: boolean (nullable = true)
 |-- character:gorr: boolean (nullable = true)
 |-- character:vision: boolean (nullable = true)
 |-- character:agnes: boolean (nullable = true)
 |-- character:falcon: boolean (nullable = true)
 |-- character:bucky: boolean (nullable = true)
 |-- character:john walker: boolean (nullable = true)
 |-- character:captain america: boolean (nullable = true)
 |-- character:loki: boolean (nullable = true)
 |-- character:casey: boolean (nullable = true)
 |-- character:the watcher: boolean (nullable = true)
 |-- character:nick fury: boolean (nullable = true)
 |-- character:iron man: boolean (nullable = true)
 |-- character:hawkeye: boolean (nullable = true)
 |-- character:ultron: boolean (nullable = true)
 |-- character:red skull: boolean (nullable = true)
 |-- character:captain marvel: boolean (nullable = true)
 |-- character:captain carter: boolean (nullable = true)
 |-- character:hulk: boolean (nullable = true)
 |-- character:nebula: boolean (nullable = true)
 |-- character:hank pym: boolean (nullable = true)
 |-- character:ant-man: boolean (nullable = true)
 |-- character:thanos: boolean (nullable = true)
 |-- character:kate bishop: boolean (nullable = true)
 |-- character:kingpin: boolean (nullable = true)
 |-- character:moon knight: boolean (nullable = true)
 |-- character:arthur harrow: boolean (nullable = true)
 |-- character:kamala khan: boolean (nullable = true)
 |-- character:she-hulk: boolean (nullable = true)
 |-- character:abomination: boolean (nullable = true)
 |-- character:odin: boolean (nullable = true)
 |-- character:pepper potts: boolean (nullable = true)
 |-- character:mj: boolean (nullable = true)
 |-- character:ned: boolean (nullable = true)
 |-- character:happy: boolean (nullable = true)
 |-- movie:black widow: boolean (nullable = true)
 |-- movie:shang-chi: boolean (nullable = true)
 |-- movie:eternals: boolean (nullable = true)
 |-- movie:spider-man: boolean (nullable = true)
 |-- movie:doctor strange: boolean (nullable = true)
 |-- movie:thor: boolean (nullable = true)
 |-- series:wandavision: boolean (nullable = true)
 |-- series:the falcon and the winter soldier: boolean (nullable = true)
 |-- series:loki: boolean (nullable = true)
 |-- series:what if: boolean (nullable = true)
 |-- series:hawkeye: boolean (nullable = true)
 |-- series:moon knight: boolean (nullable = true)
 |-- series:ms marvel: boolean (nullable = true)
 |-- series:she-hulk: boolean (nullable = true)


# Save processed intermediate data into parquet files
sub_com_with_dummy.write.parquet("/FileStore/sub_com/with_dummy/")


# Load processed intermediate data into parquet files
sub_com_with_dummy = spark.read.parquet("/FileStore/sub_com/with_dummy/")


# run regex and count the number of times each character was metioned by either submissions or comments
character_count_dict = {c: sub_com_with_dummy.filter(col(c)).count() for c in sub_com_with_dummy.columns if c.split(':')[0] == "character"}
character_count_dict

Out[269]: {'character:black widow': 42108,
 'character:yelena belova': 626,
 'character:shang-chi': 44257,
 'character:sersi': 5023,
 'character:ikaris': 6080,
 'character:thena': 3218,
 'character:ajak': 2355,
 'character:spider-man': 146361,
 'character:doctor strange': 34125,
 'character:electro': 7637,
 'character:green goblin': 8265,
 'character:doc ock': 6793,
 'character:wong': 15944,
 'character:wanda': 186333,
 'character:thor': 1262,
 'character:jane foster': 3146,
 'character:gorr': 10177,
 'character:vision': 69443,
 'character:agnes': 7943,
 'character:falcon': 33047,
 'character:bucky': 85656,
 'character:john walker': 7848,
 'character:captain america': 56294,
 'character:loki': 133711,
 'character:casey': 536,
 'character:the watcher': 6117,
 'character:nick fury': 8173,
 'character:iron man': 73849,
 'character:hawkeye': 49639,
 'character:ultron': 48224,
 'character:red skull': 5274,
 'character:captain marvel': 40279,
 'character:captain carter': 6390,
 'character:hulk': 89157,
 'character:nebula': 8061,
 'character:hank pym': 3755,
 'character:ant-man': 32571,
 'character:thanos': 100575,
 'character:kate bishop': 5404,
 'character:kingpin': 15308,
 'character:moon knight': 26959,
 'character:arthur harrow': 269,
 'character:kamala khan': 1843,
 'character:she-hulk': 20144,
 'character:abomination': 6841,
 'character:odin': 14600,
 'character:pepper potts': 713,
 'character:mj': 25721,
 'character:ned': 10208,
 'character:happy': 33374}


# run regex and count the number of times each movie was metioned by either submissions or comments
movie_count_dict = {c: sub_com_with_dummy.filter(col(c)).count() for c in sub_com_with_dummy.columns if c.split(':')[0] == "movie"}
movie_count_dict

Out[270]: {'movie:black widow': 41987,
 'movie:shang-chi': 44014,
 'movie:eternals': 49710,
 'movie:spider-man': 136529,
 'movie:doctor strange': 31266,
 'movie:thor': 132231}


# run regex and count the number of times each series was metioned by either submissions or comments
series_count_dict = {c: sub_com_with_dummy.filter(col(c)).count() for c in sub_com_with_dummy.columns if c.split(':')[0] == "series"}
series_count_dict

Out[271]: {'series:wandavision': 59422,
 'series:the falcon and the winter soldier': 2479,
 'series:loki': 133711,
 'series:what if': 40495,
 'series:hawkeye': 45844,
 'series:moon knight': 25062,
 'series:ms marvel': 20818,
 'series:she-hulk': 19844}


# Save the count result into dictionaries
import json

with open("./data/json/character_count.json", "w") as fID:
    json.dump(character_count_dict, fID)

with open("./data/json/movie_count.json", "w") as fID:
    json.dump(movie_count_dict, fID)
    
with open("./data/json/series_count.json", "w") as fID:
    json.dump(series_count_dict, fID)


# Load the character info json file from a REST API
import requests
import json

# Get the character info from API
api = 'https://akabab.github.io/superhero-api/api/all.json'
data_char = requests.get(api).text
parsed = json.loads(data_char)

# Select the info that we need in the analysis
hero_ls = []
col_ls = ['name', 'intelligence', 'strength', 'speed', 'durability', 'power', 'combat', 'gender', 'race', 'first_name', 'last_name', 'publisher', 'alignment']
for i in range(len(parsed)):
    info = []
    info.append(parsed[i]['name'].lower())
    info.append(parsed[i]['powerstats']['intelligence'])
    info.append(parsed[i]['powerstats']['strength'])
    info.append(parsed[i]['powerstats']['speed'])
    info.append(parsed[i]['powerstats']['durability'])
    info.append(parsed[i]['powerstats']['power'])
    info.append(parsed[i]['powerstats']['combat'])
    info.append(parsed[i]['appearance']['gender'])
    info.append(parsed[i]['appearance']['race'])
    if len(parsed[i]['biography']['fullName'].split(' ')) < 2:
        info.append(parsed[i]['biography']['fullName'].lower())
        info.append(parsed[i]['biography']['fullName'].lower())
    else:
        info.append(parsed[i]['biography']['fullName'].split(' ')[0].lower())
        info.append(parsed[i]['biography']['fullName'].split(' ')[1].lower())
    info.append(parsed[i]['biography']['publisher'])
    info.append(parsed[i]['biography']['alignment'])
    hero_ls.append(info)


# Create a spark dataframe based on the loaded json data
df_char = spark.createDataFrame(hero_ls, col_ls)


# Show the schema of the df
df_char.printSchema()

root
 |-- name: string (nullable = true)
 |-- intelligence: long (nullable = true)
 |-- strength: long (nullable = true)
 |-- speed: long (nullable = true)
 |-- durability: long (nullable = true)
 |-- power: long (nullable = true)
 |-- combat: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- race: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- alignment: string (nullable = true)


# Load char count json file
with open('./data/json/character_count.json', 'r') as fID:
    char_count = json.loads(fID.read())


# Get the name and count
char_name_ls = [name[10:] for name in list(char_count.keys())]
char_count_ls = list(char_count.values())
char_count_pair = [[char_name_ls[i], char_count_ls[i]] for i in range(len(char_name_ls))]
char_we_need = spark.createDataFrame(char_count_pair, ['name', 'mention_count'])
# Merge the char info and the char count
df_char_new = df_char.withColumn('name', when(col('name') == 'ant-man ii', 'ant-man').when(col('name') == 'hawkeye ii', 'kate bishop').otherwise(df_char.name))
df_char_new = df_char_new.filter((col('publisher') != 'DC Comics') & (col('publisher') != 'Giant-Man'))
cond = [(df_char_new.name == char_we_need.name) | (df_char_new.first_name == char_we_need.name) | (df_char_new.last_name == char_we_need.name)]
col_need = ['b.name', 'intelligence', 'strength', 'speed', 'durability', 'power', 'combat', 'gender', 'race', 'first_name', 'last_name', 'publisher', 'alignment', 'mention_count']
df_char_new = df_char_new.alias('a').join(char_we_need.alias('b'), cond, how = 'right').select(col_need)


# Select needed columns
col_ls = ['name', 'intelligence', 'strength', 'speed', 'durability', 'power', 'combat', 'gender', 'race', 'alignment', 'mention_count']
df_character = df_char_new.select(col_ls)


# Load the external media information data
media_df = spark.read.option("header", True).csv('dbfs:/FileStore/media_info.csv')


media_df.show(5)

+--------------------+------------+--------+----------------+----------------+
|          media_name|release_date|is_movie|box_office (USD)|investment (USD)|
+--------------------+------------+--------+----------------+----------------+
|         wandavision|  01-15-2021|   FALSE|            null|       200000000|
|the falcon and th...|  03-19-2021|   FALSE|            null|       150000000|
|                loki|  06-09-2021|   FALSE|            null|       225000000|
|         black widow|  07-09-2021|    TRUE|       379751655|       200000000|
|             what if|  08-11-2021|   FALSE|            null|       150000000|
+--------------------+------------+--------+----------------+----------------+
only showing top 5 rows


# Print the schema of loaded media information
media_df.printSchema()

root
 |-- media_name: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- is_movie: string (nullable = true)
 |-- box_office (USD): string (nullable = true)
 |-- investment (USD): string (nullable = true)


# Change data type
from pyspark.sql.types import StringType,BooleanType,DateType
media_info = media_df.withColumn('is_movie', col('is_movie').cast(BooleanType()))\
                   .withColumn('box_office (USD)', col('box_office (USD)').cast('long'))\
                   .withColumn('investment (USD)', col('investment (USD)').cast('long'))\
                   .withColumn('release_date', to_date(col('release_date'),"MM-dd-yyyy"))


media_info.printSchema()

root
 |-- media_name: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- is_movie: boolean (nullable = true)
 |-- box_office (USD): long (nullable = true)
 |-- investment (USD): long (nullable = true)


# Import the previously created json file of count of submissions and comments that mentioned each movie
movie_count = [{"black widow": 41987, "shang-chi": 23290, "eternals": 49710, "spider-man": 136529, "doctor strange": 31266, "thor": 132231}]
df_movie_count = spark.createDataFrame(movie_count)
df_movie_count.show()

+-----------+--------------+--------+---------+----------+------+
|black widow|doctor strange|eternals|shang-chi|spider-man|  thor|
+-----------+--------------+--------+---------+----------+------+
|      41987|         31266|   49710|    23290|    136529|132231|
+-----------+--------------+--------+---------+----------+------+


# Pivot the count table
df_movie_count = df_movie_count.select(expr("stack(6, 'black widow', `black widow`, 'doctor strange', `doctor strange`, 'eternals', `eternals`, 'shang-chi', `shang-chi`, 'spider-man', `spider-man`, 'thor', `thor`) as (media_name, mention_count)"))
df_movie_count.show()

+--------------+-------------+
|    media_name|mention_count|
+--------------+-------------+
|   black widow|        41987|
|doctor strange|        31266|
|      eternals|        49710|
|     shang-chi|        23290|
|    spider-man|       136529|
|          thor|       132231|
+--------------+-------------+


# Import the previously created json file of count of submissions and comments that mentioned each series
series_count = [{"wandavision": 59422, "the falcon and the winter soldier": 2479, "loki": 133711, "what if": 40495, "hawkeye": 45844, "moon knight": 25062, "ms marvel": 20818, "she-hulk": 19844}]
df_series_count = spark.createDataFrame(series_count)
df_series_count.show()

+-------+------+-----------+---------+--------+---------------------------------+-----------+-------+
|hawkeye|  loki|moon knight|ms marvel|she-hulk|the falcon and the winter soldier|wandavision|what if|
+-------+------+-----------+---------+--------+---------------------------------+-----------+-------+
|  45844|133711|      25062|    20818|   19844|                             2479|      59422|  40495|
+-------+------+-----------+---------+--------+---------------------------------+-----------+-------+


#pivot the count table
df_series_count = df_series_count.select(expr("stack(8, 'wandavision', `wandavision`, 'the falcon and the winter soldier', `the falcon and the winter soldier`, 'loki', `loki`, 'what if', `what if`, 'Hawkeye', `Hawkeye`, 'moon knight', `moon knight`, 'ms marvel', `ms marvel`, 'she-hulk', `she-hulk`) as (media_name, mention_count)"))
df_series_count.show()

+--------------------+-------------+
|          media_name|mention_count|
+--------------------+-------------+
|         wandavision|        59422|
|the falcon and th...|         2479|
|                loki|       133711|
|             what if|        40495|
|             Hawkeye|        45844|
|         moon knight|        25062|
|           ms marvel|        20818|
|            she-hulk|        19844|
+--------------------+-------------+


df_media_count = df_movie_count.union(df_series_count)
df_media_count.show()

+--------------------+-------------+
|          media_name|mention_count|
+--------------------+-------------+
|         black widow|        41987|
|      doctor strange|        31266|
|            eternals|        49710|
|           shang-chi|        23290|
|          spider-man|       136529|
|                thor|       132231|
|         wandavision|        59422|
|the falcon and th...|         2479|
|                loki|       133711|
|             what if|        40495|
|             Hawkeye|        45844|
|         moon knight|        25062|
|           ms marvel|        20818|
|            she-hulk|        19844|
+--------------------+-------------+


#join the count value with media_info
media_info = media_info.join(df_media_count, "media_name", 'left')
media_info.show()

+--------------------+------------+--------+----------------+----------------+-------------+
|          media_name|release_date|is_movie|box_office (USD)|investment (USD)|mention_count|
+--------------------+------------+--------+----------------+----------------+-------------+
|            eternals|  2021-11-05|    true|       402064899|       200000000|        49710|
|                loki|  2021-06-09|   false|            null|       225000000|       133711|
|           shang-chi|  2021-09-03|    true|       432243292|       150000000|        23290|
|         black widow|  2021-07-09|    true|       379751655|       200000000|        41987|
|         moon knight|  2022-03-30|   false|            null|       150000000|        25062|
|             what if|  2021-08-11|   false|            null|       150000000|        40495|
|      doctor strange|  2022-05-06|    true|       955775804|       200000000|        31266|
|                thor|  2022-07-08|    true|       760755255|       250000000|       132231|
|the falcon and th...|  2021-03-19|   false|            null|       150000000|         2479|
|             Hawkeye|  2021-11-24|   false|            null|       150000000|        45844|
|         wandavision|  2021-01-15|   false|            null|       200000000|        59422|
|          spider-man|  2021-12-17|    true|      1916306995|       200000000|       136529|
|           ms marvel|  2022-06-08|   false|            null|       150000000|        20818|
|            she-hulk|  2022-08-17|   false|            null|       225000000|        19844|
+--------------------+------------+--------+----------------+----------------+-------------+


#Table 1:
# Count of gender
df_character = df_character.na.fill('unknown', subset = ['gender', 'alignment', 'race'])
gender_avg_count = df_character.groupby('gender').mean('mention_count')
gender_count = df_character.groupby('gender').count()
gender_count.join(gender_avg_count, on = 'gender').show()

+-------+-----+------------------+
| gender|count|avg(mention_count)|
+-------+-----+------------------+
|   Male|   23| 47444.69565217391|
| Female|    6|50388.166666666664|
|unknown|   21| 7527.571428571428|
+-------+-----+------------------+


#Table 2:
# Count of alignment
alignment_avg_count = df_character.groupby('alignment').mean('mention_count')
alignment_count = df_character.groupby('alignment').count()
alignment_count.join(alignment_avg_count, on = 'alignment').show()

+---------+-----+------------------+
|alignment|count|avg(mention_count)|
+---------+-----+------------------+
|     good|   19| 45964.63157894737|
|      bad|   10|           52022.9|
|  unknown|   21| 7527.571428571428|
+---------+-----+------------------+


#Table 3:
# Count of race 
race_avg_count = df_character.groupby('race').mean('mention_count')
race_count = df_character.groupby('race').count()
race_count.join(race_avg_count, on = 'race').show()

+-----------------+-----+------------------+
|             race|count|avg(mention_count)|
+-----------------+-----+------------------+
|            Human|   17|40576.294117647056|
|          Eternal|    1|          100575.0|
|          unknown|   22| 7425.136363636364|
|        Asgardian|    2|           67486.5|
|          Android|    2|           58833.5|
|Human / Radiation|    2|           47999.0|
|        Luphomoid|    1|            8061.0|
|       Human-Kree|    1|           40279.0|
|    God / Eternal|    1|           14600.0|
|           Mutant|    1|          186333.0|
+-----------------+-----+------------------+


#Table 4: Character
# Get a summary table on the powerstats of Marvel heros
df_character.select('intelligence', 'strength', 'speed', 'durability', 'power', 'combat', 'mention_count').summary("min", "25%", "mean", "75%", "max", "stddev").show()

+-------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|summary|      intelligence|          strength|             speed|        durability|            power|           combat|    mention_count|
+-------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|    min|                38|                 6|                 8|                10|               13|               30|              269|
|    25%|                63|                12|                23|                42|               29|               64|             5404|
|   mean| 77.24137931034483|46.172413793103445|39.172413793103445| 64.34482758620689|63.06896551724138|75.86206896551724|         31032.72|
|    75%|                90|                83|                53|                95|              100|               90|            42108|
|    max|               100|               100|                83|               100|              100|              100|           186333|
| stddev|17.887855362175983| 35.36550878783819| 19.88910140453316|29.840379955055347| 32.5762084903371| 17.7718158448608|41085.59461836338|
+-------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+


#Table 5: Media info 1
#overall profit of Marvel Studio from Movies from Jan, 2021 to Aug, 2022
sum_invest = media_info.groupby('is_movie').agg({'box_office (USD)': "sum", 'investment (USD)': "sum"})
#sum_invest = sum_invest.filter(sum_invest['is_movie'] == True)
sum_invest = sum_invest.withColumn('profit (USD)', col('sum(box_office (USD))')-col('sum(investment (USD))'))
sum_invest.show()

+--------+---------------------+---------------------+------------+
|is_movie|sum(box_office (USD))|sum(investment (USD))|profit (USD)|
+--------+---------------------+---------------------+------------+
|    true|           4846897900|           1200000000|  3646897900|
|   false|                 null|           1400000000|        null|
+--------+---------------------+---------------------+------------+


#Table 6: Media info 2
#summary of media info
media_info_pd = media_info.toPandas()
media_info_pd


# Global Configurations
import matplotlib.pyplot as plt
from cycler import cycler

plt.rcParams["font.family"] = "P052"
# plt.rcParams['axes.prop_cycle'] = cycler(color=['#a2d2ff', '#005f73', '#0a9396', '#94d2bd', '#e9d8a6', '#ca6702', '#bb3e03', '#001219'])
plt.rcParams["axes.titlesize"] = 24
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["xtick.labelsize"] = 16
plt.rcParams["ytick.labelsize"] = 16
plt.rcParams["legend.fontsize"] = 16


date_count = sub_com.withColumn("created_date", to_date(col("created_ts"))).groupBy("created_date").count()
date_count.show(5)

+------------+-----+
|created_date|count|
+------------+-----+
|  2021-12-18|21421|
|  2022-07-31| 9765|
|  2021-06-22| 3375|
|  2021-08-27|10337|
|  2022-03-28|10027|
+------------+-----+
only showing top 5 rows


date_count_pandas = date_count.toPandas()


date_count_pandas.plot(x="created_date", y="count", figsize=(16, 9), color='#005f73')
plt.xlabel("Date",  size = 20)
plt.ylabel("# of submission and comements", size = 20)
plt.title("Number of Submissions and Comments 2021 Jan - 2022 Aug", size = 25)

Out[187]: Text(0.5, 1.0, 'Number of Submissions and Comments 2021 Jan - 2022 Aug')


usr_posts_count = sub_com.groupBy("author").count()
usr_posts_count.show(5)

+------------+-----+
|      author|count|
+------------+-----+
|wickedplague|    2|
|Upintheair84|   59|
| Sweetartums|    2|
|   CTB021300|    3|
|  sable-king|  895|
+------------+-----+
only showing top 5 rows


usr_posts_count_pandas = usr_posts_count.select("count").toPandas()


three_std_gt_mean = usr_posts_count_pandas['count'].mean() + 3 * usr_posts_count_pandas['count'].std()
usr_posts_count_pandas[usr_posts_count_pandas['count'] < three_std_gt_mean].hist(figsize=(16, 9), bins=30, color='#005f73')
plt.yscale("log")
plt.xlabel("# of submission and comements",  size = 20)
plt.ylabel("log(# of users in marvelstudio subreddit)", size = 20)
plt.title("Number of Posts Per User Historgram", size = 25)

Out[189]: Text(0.5, 1.0, 'Number of Posts Per User Historgram')


# plot for char info 1
# radar plot for heros' power
import numpy as np

col_ls_v1 = ['name', 'intelligence', 'strength', 'speed', 'durability', 'power', 'combat', 'mention_count', 'alignment']
# filter 
dat_v1 = df_character.select(col_ls_v1)
dat_v1 = dat_v1.filter(dat_v1.intelligence.isNotNull())
# change to pandas
char_v1_pd = dat_v1.toPandas()
char_v1_pd.index = char_v1_pd.name

char_v1_labels = ['intelligence', 'strength', 'speed', 'durability', 'power', 'combat']
num_vars = len(char_v1_labels)

angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]


fig, ax = plt.subplots(figsize=(5, 5), subplot_kw=dict(polar=True), dpi = 200)

plt.rcParams["font.family"] = "P052"

# def a function to add hero that we want
def add_to_radar(hero, color):
  values = char_v1_pd.loc[hero, char_v1_labels].tolist()
  values += values[:1]
  ax.plot(angles, values, color=color, linewidth=1, label=hero)
  ax.fill(angles, values, color=color, alpha=0.25)

add_to_radar('captain america', '#001219')
add_to_radar('black widow', '#005f73')
add_to_radar('shang-chi', '#0a9396')
add_to_radar('spider-man', '#94d2bd')
add_to_radar('doctor strange', '#e9d8a6')
add_to_radar('thor', '#ca6702')
add_to_radar('iron man', '#bb3e03')

ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)

ax.set_thetagrids(np.degrees(angles)[:-1], char_v1_labels, fontsize = 13)

for label, angle in zip(ax.get_xticklabels(), angles):
  if angle in (0, np.pi):
    label.set_horizontalalignment('center')
  elif 0 < angle < np.pi:
    label.set_horizontalalignment('left')
  else:
    label.set_horizontalalignment('right')
    
ax.set_ylim(0, 100)

ax.set_rlabel_position(180 / num_vars)

ax.tick_params(colors='#222222')
# Make the y-axis (0-100) labels smaller.
ax.tick_params(axis='y', labelsize=8)
# Change the color of the circular gridlines.
ax.grid(color='#AAAAAA')
# Change the color of the outermost gridline (the spine).
ax.spines['polar'].set_color('#222222')
# Change the background color inside the circle itself.
ax.set_facecolor('#FAFAFA')

# Add title.
ax.set_title("Heros' Powerstats", y=1.15, fontsize = 20)

# Add a legend as well.
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize = 7)

Out[169]: <matplotlib.legend.Legend at 0x7fe1185917f0>


# plot for char info 2
# power, mentioned times and alignment

char_v2_pd = char_v1_pd
char_v2_pd['overall power'] = char_v2_pd['intelligence'] + char_v2_pd['speed'] + char_v2_pd['durability'] + char_v2_pd['power'] + char_v2_pd['combat']
col_v2_ls = ['overall power', 'alignment', 'count']

alist = np.array(['good', 'bad'])
alignment = char_v2_pd['alignment'].values
power = char_v2_pd['overall power'].values
count = char_v2_pd['mention_count'].values
colors = ['#ae2012', '#005f73']


fig, ax = plt.subplots(figsize=(8,8))

plt.rcParams["font.family"] = "P052"

for a, c in zip(alist, colors):
    idxs = np.where(alignment == a)
    ax.scatter(
        power[idxs], count[idxs], label = a,
        s = 80, color = c, alpha = 0.6
    )

plt.ylabel('Mentioned Times')
plt.xlabel('Overall Power')
ax.set_title("Heros' Power, Mentioned Times and Alignment", y=1.05)

ax.legend();


#transfer spark df to pandas df
media_info_pd = media_info.toPandas()


# plot for media info
# mentioned times and investment value
#colors = ['#ae2012', '#005f73']

#distribution of mention_count for each media
count = media_info_pd['mention_count']
name = media_info_pd['media_name']
invest = media_info_pd['investment (USD)']
x = range(len(name))

fig = plt.figure(figsize=(22,10))
ax1 = fig.add_subplot()
#fig, ax1 = plt.subplots(figsize=(22,10))
ax2 = ax1.twinx()

# plt.figure()
ax1.bar(x, count, alpha = 0.5, width = 0.5, label = 'Mention Counts', color = '#005f73')
ax2.scatter(x, invest, color = '#ae2012', label = 'Investment (USD Million)', s = 80, alpha=0.5)

plt.xticks(x,name)
ax1.tick_params(rotation = 90)
ax1.set_xlabel("Media Name")
ax1.set_ylabel("Mention Counts")
ax2.set_ylabel("Investment (USD Million)")
plt.title("Number of Mention Counts & Investment Value of Each Media")
ax1.legend(loc = (0.01, 0.9) )#'upper left')
ax2.legend(loc = (0.01, 0.85) )#'upper right')
plt.show()

10 Business Questions¶

Media Class¶

Character Class¶

Externel Data Link¶

Reddit Data Exploration¶

Read Reddit Data from parquet files and filter r/marvelstudio subreddit¶

Basic stats (shape, schema, etc.) of loaded dataframes¶

Collect time range of the data¶

Extract relevant columns and generate applicable dataframe¶

Basic info of extracted dataframe¶

Basic data wrangling¶

Count null values in the dataset¶

Concat/rename columns¶

Format dataframes column names and datatypes¶

Union cleaned datasets to generate a unified dataframe for future steps¶

Save the unified dataset into parquet files (intermediate dataframe)¶

Generate dummy variables indicating whether Marvel characters / movies or series are mentioned in each submission/comment¶

Create dict containing marvel characters, movies, Disney+ series as keys and their alias as values¶

Use regex to generate dummy columns to show if character/movie/series was metioned in the content for each submission/comment¶

Save the dataframe with dummy variables into parquet files (intermediate dataframe)¶

Evaluate and count for characters, movies, and Disney+ series¶

Save the result dicts to json files so that there is no need to run the above evaluations again in the future¶

External Data Exploration¶

Character Information¶

Media Information¶

Summary Table¶

Table 1¶

Table 2¶

Table 3¶

Table 4¶

Table 5¶

Table 6¶

Early Data Visualization¶

Plot 1¶

Plot 2¶

Plot 3¶

Plot 4¶

Plot 5¶

	media_name	release_date	is_movie	box_office (USD)	investment (USD)	mention_count
0	wandavision	2021-01-15	False	NaN	200000000	59422
1	the falcon and the winter soldier	2021-03-19	False	NaN	150000000	2479
2	loki	2021-06-09	False	NaN	225000000	133711
3	black widow	2021-07-09	True	3.797517e+08	200000000	41987
4	what if	2021-08-11	False	NaN	150000000	40495
5	shang-chi	2021-09-03	True	4.322433e+08	150000000	23290
6	eternals	2021-11-05	True	4.020649e+08	200000000	49710
7	Hawkeye	2021-11-24	False	NaN	150000000	45844
8	spider-man	2021-12-17	True	1.916307e+09	200000000	136529
9	moon knight	2022-03-30	False	NaN	150000000	25062
10	doctor strange	2022-05-06	True	9.557758e+08	200000000	31266
11	ms marvel	2022-06-08	False	NaN	150000000	20818
12	thor	2022-07-08	True	7.607553e+08	250000000	132231
13	she-hulk	2022-08-17	False	NaN	225000000	19844