import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import textwrap
from pyvis.network import Network
import math
contract = pd.read_csv('contract_sub.csv')
contract_info = contract.iloc[:,[56, 65, 66, 67, 69, 70, 73, 87, 88, 92]].copy()
contract_info['subaward_action_date'] = pd.to_datetime(contract_info['subaward_action_date'])
contract_info['subaward_action_year'] = contract_info['subaward_action_date'].apply(lambda x: x.year)
contract_info['subaward_action_month'] = contract_info['subaward_action_date'].apply(lambda x: x.month)
contract_info = contract_info.sort_values(by = 'subaward_action_date')
contract_info.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 348 entries, 311 to 316 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 prime_award_project_title 348 non-null object 1 subaward_number 348 non-null object 2 subaward_amount 348 non-null float64 3 subaward_action_date 348 non-null datetime64[ns] 4 subawardee_duns 348 non-null int64 5 subawardee_name 348 non-null object 6 subawardee_parent_name 243 non-null object 7 subaward_primary_place_of_performance_state_name 348 non-null object 8 subaward_primary_place_of_performance_address_zip_code 348 non-null int64 9 subaward_description 348 non-null object 10 subaward_action_year 348 non-null int64 11 subaward_action_month 348 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(4), object(6) memory usage: 35.3+ KB
## remove duplicate sub-contractors duns
company_info = contract_info[['subawardee_duns', 'subawardee_name']]
company_info = company_info.drop_duplicates()
company_duns = company_info.groupby('subawardee_duns').apply(lambda df: len(df)).reset_index()
company_duns.columns = ['subawardee_duns', 'count']
company_dupli = company_duns[company_duns['count'] > 1]['subawardee_duns']
company_info[company_info['subawardee_duns'].isin(company_dupli)]
subawardee_duns | subawardee_name | |
---|---|---|
18 | 803209782 | ORBITAL SCIENCES CORPORATION |
15 | 803209782 | ORBITAL SCIENCES LLC |
286 | 175344068 | COBHAM MISSION SYSTEMS ORCHARD PARK INC. |
287 | 175344068 | CARLETON TECHNOLOGIES INC |
184 | 2742497 | UNI-CAST, INC. |
69 | 2742497 | UNI-CAST, LLC |
98 | 787085901 | MORGAN ADVANCED CERAMICS, INC |
231 | 787085901 | MORGAN ADVANCED CERAMICS, INC. |
233 | 831282715 | INOVAR, INC. |
247 | 831282715 | SPARTRONICS LOGAN, LLC |
219 | 829171243 | DUCOMMUN LABARGE TECHNOLOGIES, INC. |
238 | 829171243 | LABARGE ACQUISITION COMPANY, INC. |
143 | 198722316 | PORT ELECTRONICS CORPORATION |
137 | 198722316 | PORT ELECTRONICS CORP. |
142 | 617275149 | GRAY'S ENGINEERING & CONSULTING, INC. |
158 | 617275149 | GRAY'S ENGINEERING & CONSULTING, LLC |
duns_need_check = [175344068, 831282715, 829171243]
# print(contract_info[contract_info['subawardee_duns'].isin(duns_need_check)].sort_values(['subawardee_duns', 'subaward_action_date']))
checked_name = ['CARLETON TECHNOLOGIES/COBHAM MISSION SYSTEMS ORCHARD PARK', 'DUCOMMUN LABARGE TECHNOLOGIES, INC.', 'INOVAR/SPARTRONICS LOGAN']
## 803209782, 2742497, 787085901, 829171243, 198722316, 617275149
## These duns have the same company name so we unify them.
unify_duns = [803209782, 2742497, 787085901, 198722316, 617275149]
unify_name = ['ORBITAL SCIENCES', 'UNI-CAST',
'MORGAN ADVANCED CERAMICS', 'PORT ELECTRONICS',
"'GRAY'S ENGINEERING & CONSULTING"]
unify_comp = pd.DataFrame({'subawardee_duns':unify_duns + duns_need_check, 'unify_name':unify_name + checked_name})
contract_info = pd.merge(contract_info, unify_comp, on = ['subawardee_duns'], how = 'left')
contract_info['subawardee_name'] = np.where(contract_info['unify_name'].isnull(), contract_info['subawardee_name'], contract_info['unify_name'])
group_l = ['subaward_action_year', 'prime_award_project_title']
contract_yearsum = contract_info.groupby(group_l).apply(lambda df: df['subaward_amount'].sum()).reset_index()
contract_yearsum.columns = ['subaward_action_year', 'prime_award_project_title', 'amount']
# colors = ['#b23a48','#d84727', '#ef7b45', '#cdedf6', '#5eb1bf', '#042a2b']
fig1 = px.bar(contract_yearsum, x = 'subaward_action_year', y = 'amount',
color = 'prime_award_project_title', hover_name = 'prime_award_project_title', template = 'seaborn',
width = 800, height = 400, hover_data = {'subaward_action_year':False, 'prime_award_project_title':False},
title = "<b>Subaward Amount in Different Years</b><br><sup>(Hover to show the company info or zoom)</sup>",
log_y = False, labels = {'subaward_action_year':'year',
'prime_award_project_title': 'project'},
color_discrete_sequence = ['#eaac8b', '#e56b6f', '#b56576', '#6d597a', '#355070', '#283d3b']
)
fig1.update_layout(font_family = "Roboto Slab", margin = {'t':50, 'l':30, 'b':50, 'r':30},
showlegend = True, plot_bgcolor = '#f5ebe0', paper_bgcolor = '#edede9',
legend = dict(font = dict(size = 8)))
fig1.show()
fig1.write_html("bar.html")
group_l = ['subawardee_name']
contract_compsum = contract_info.groupby(group_l).apply(lambda df: df['subaward_amount'].sum()).reset_index()
contract_compsum.columns = ['subawardee_name', 'amount']
fig2 = px.pie(contract_compsum, values = 'amount', names = 'subawardee_name',
title = '<b>Subawards Amount for Different Sub-contractors</b><br><sup>(Hover to show the details)</sup>',
template = 'seaborn', width = 500, height = 400,
hover_name = 'subawardee_name', hover_data = {'subawardee_name':False})
fig2.update_layout(font_family = "Roboto Slab", margin = {'t':50, 'l':30, 'b':50, 'r':30},
showlegend = False, paper_bgcolor = '#edede9')
fig2.update_traces(textposition = 'inside', textinfo = 'percent+label')
fig2.show()
fig2.write_html("pie_comp.html")
group_l = ['subaward_description']
contract_dscpsum = contract_info.groupby(group_l).apply(lambda df: df['subaward_amount'].sum()).reset_index()
contract_dscpsum.columns = ['subawardee_description', 'amount']
fig3 = px.pie(contract_dscpsum, values = 'amount', names = 'subawardee_description',
title = '<b>Subawards Amount for Different Description of Subcontracts</b><br><sup>(Hover to show the details)</sup>',
template = 'seaborn', width = 500, height = 400,
hover_name = contract_dscpsum.subawardee_description.apply(lambda txt: '<br>'.join(textwrap.wrap(txt, width=50))),
hover_data = {'subawardee_description':False},)
fig3.update_layout(font_family = "Roboto Slab", margin = {'t':50, 'l':30, 'b':50, 'r':30},
showlegend = False, paper_bgcolor = '#edede9')
fig3.update_traces(textposition = 'inside', textinfo = 'percent+label')
fig3.show()
fig3.write_html("pie_cont.html")
# group_l = ['subaward_description']
# contract_dscpcont = contract_info.groupby(group_l).apply(lambda df: len(df)).reset_index()
# contract_dscpcont.columns = ['subawardee_description', 'count']
# fig4 = px.pie(contract_dscpcont, values = 'count', names = 'subawardee_description',
# title = 'Counts of different descriptions<br><sup>(Hover to show the details)</sup>',
# template = 'seaborn', width = 400, height = 400,
# hover_name = contract_dscpsum.subawardee_description.apply(lambda txt: '<br>'.join(textwrap.wrap(txt, width=50))),
# hover_data = {'subawardee_description':False},)
# fig4.update_layout(font_family = "Roboto Slab", margin = {'t':50, 'l':30, 'b':50, 'r':30},
# showlegend = False, paper_bgcolor = '#edede9')
# fig4.update_traces(textposition = 'inside', textinfo = 'percent+label')
# fig4.show()
areas = contract_info['subaward_description'].values
products = list()
for i in range(len(areas)):
products = products + contract_info.loc[i,'subaward_description'].split(',')
for i in range(len(products)):
products[i] = products[i].replace('.', '').strip()
products = list(set(products))
products.remove('Cost Plus Fixed Fee')
products.remove('Firm Fixed Price')
products.remove('NRE - Other')
group_l = ['prime_award_project_title', 'subawardee_name', 'subaward_description']
nodes = contract_info.groupby(group_l).apply(lambda df: sum(df['subaward_amount'])).reset_index()
nodes.columns = group_l + ['amount']
projs = []
company_name = []
product_service = []
amount = []
for i in range(len(nodes)):
proj = nodes.loc[i, 'prime_award_project_title']
prods = nodes.loc[i, 'subaward_description'].split(',')
price = nodes.loc[i, 'amount'] / len(prods)
name = nodes.loc[i, 'subawardee_name']
for j in range(len(prods)):
projs.append(proj)
company_name.append(name)
product_service.append(prods[j].replace('.', '').strip())
amount.append(price)
company_area = pd.DataFrame({'project':projs, 'company_name':company_name, 'product_service':product_service, 'amount':amount})
company_prod_count = company_area.groupby(['project','company_name']).apply(lambda df: len(df)).reset_index()
company_prod_count.columns = ['project', 'company_name', 'product_count']
company_con_count = contract_info.groupby(['prime_award_project_title', 'subawardee_name']).apply(lambda df: len(df)).reset_index()
company_con_count.columns = ['project', 'company_name', 'contract_count']
company_avg = contract_info.groupby(['prime_award_project_title', 'subawardee_name']).apply(lambda df: df['subaward_amount'].mean()).reset_index()
company_avg.columns = ['project', 'company_name', 'average_amount']
company_bb_df = pd.merge(company_prod_count, company_con_count, on = ['project','company_name'], how = 'left')
company_bb_df = pd.merge(company_bb_df, company_avg, on = ['project','company_name'], how = 'left')
fig5 = px.scatter(company_bb_df, x = 'average_amount', y = 'product_count',
size = 'contract_count', color = 'project', template = 'seaborn',
width = 800, height = 400, log_x = True, hover_name = 'project',
hover_data = {'project':False, 'company_name':True},
title = "Companies's Product/Service in Different Projects<br><sup>(Hover to see more info)</sup>",
labels = {'product_count': 'product/service provided', 'average_amount': 'average amount per subaward'},
color_discrete_sequence = ['#eaac8b', '#283d3b', '#e56b6f', '#b56576', '#6d597a', '#355070'])
fig5.update_layout(font_family = "Roboto Slab", margin = {'t':50, 'l':30, 'b':50, 'r':30},
showlegend = True, plot_bgcolor = '#f5ebe0', paper_bgcolor = '#edede9',
legend = dict(font = dict(size = 8)))
fig5.show()
fig5.write_html("bubble.html")
company_name = company_area['company_name']
product_service = company_area['product_service']
nodes_df = pd.concat([company_name, product_service])
prod_amount = company_area.groupby('product_service').sum().reset_index()
prod_amount['color'] = '#f72585'
prod_amount.columns = ['node', 'weight', 'color']
comp_amount = company_area.groupby('company_name').sum().reset_index()
comp_amount['color'] = '#4361ee'
comp_amount.columns = ['node', 'weight', 'color']
node_df = pd.concat([prod_amount, comp_amount], axis = 0).reset_index(drop = True)
net = Network(height = '650px', width = '90%', bgcolor = '#edede9')
for i in range(len(node_df)):
net.add_node(node_df.loc[i, 'node'], size = math.log(node_df.loc[i, 'weight']/100), color = node_df.loc[i, 'color'])
for j in range(len(company_area)):
net.add_edge(company_area.loc[j, 'company_name'], company_area.loc[j, 'product_service'])
net.set_options("""
var options = {
"nodes": {
"shadow": {
"enabled": true
}
},
"physics": {
"barnesHut": {
"gravitationalConstant": -13200,
"centralGravity": 2.45,
"springLength": 10,
"springConstant": 0.07
},
"minVelocity": 0.75
}
}
""")
net.show('network.html')
company_count = contract_info.groupby(['subawardee_duns', 'subaward_description']).apply(lambda df: len(df)).reset_index()
company_count.columns = ['subawardee_duns', 'subaward_description', 'contract_count']
# np.unique(company_count['subawardee_duns'], return_counts = True)
(array([ 2742497, 3087715, 4128971, 4953852, 6069710, 6820633, 8385312, 8975880, 9566167, 16426298, 18481390, 20544636, 21649546, 22463140, 26249487, 27694888, 29351079, 30387286, 39115845, 39119057, 39418750, 39726455, 40707460, 42568774, 42643973, 43570142, 44239754, 45017675, 45102274, 48500888, 58215778, 58396268, 60605883, 62090931, 62546176, 75458455, 79735571, 79975806, 80242612, 84190594, 84692466, 102814464, 115022725, 117056811, 117474014, 119159270, 123083784, 131810918, 134729487, 139119536, 149433844, 154827786, 175344068, 181709775, 185340130, 189737810, 196559991, 198722316, 364015917, 603030065, 612074369, 617275149, 623422941, 623681124, 780680976, 781085055, 787085901, 788104334, 803209782, 826534286, 829171243, 831282715, 832416932, 958814345, 959852161, 961642423, 963709212]), array([2, 1, 2, 2, 2, 2, 2, 3, 3, 2, 1, 3, 2, 2, 3, 1, 3, 2, 2, 2, 1, 1, 1, 2, 1, 1, 4, 1, 5, 2, 1, 1, 1, 2, 1, 1, 1, 2, 4, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 2, 1, 1, 2, 2, 3, 2, 2, 2, 1, 2, 2, 3, 1, 2, 4, 2, 3, 3, 4, 6, 1, 5, 1, 3]))
group_l1 = ['subaward_action_year', 'subaward_action_month', 'subaward_number', 'subawardee_duns']
contract_timeline = contract_info.groupby(group_l1).apply(lambda df: df['subaward_amount'].sum()).reset_index()
contract_timeline.columns = ['subaward_action_year', 'subaward_action_month', 'subaward_number', 'subawardee_duns', 'amount']
group_l2 = ['subaward_number', 'subawardee_duns']
contract_end_date = contract_info.groupby(group_l2).apply(lambda df: df['subaward_action_date'].max()).reset_index()
contract_end_date.columns = ['subaward_number', 'subawardee_duns', 'end_date']
group_l3 = ['subaward_number', 'subawardee_duns']
contract_start_date = contract_info.groupby(group_l3).apply(lambda df: df['subaward_action_date'].min()).reset_index()
contract_start_date.columns = ['subaward_number', 'subawardee_duns', 'start_date']
contract_dates = pd.merge(contract_start_date, contract_end_date, on = ['subaward_number', 'subawardee_duns'], how = 'left')
# contract_timeline = pd.merge(contract_dates, contract_timeline, on = ['subaward_number', 'subawardee_duns'], how = 'inner')
# contract_timeline['year_month'] = contract_timeline['subaward_action_year'].astype(str) + '-' + contract_timeline['subaward_action_month'].astype(str)
# contract_timeline = contract_timeline.drop(['subaward_action_year', 'subaward_action_month'], axis = 1)
contract_total = pd.merge(contract_timeline, contract_info, on = ['subaward_number', 'subawardee_duns', 'subaward_action_year', 'subaward_action_month'], how = 'left')
contract_total['class'] = np.where(contract_total['subaward_description'] == 'Firm Fixed Price',
'Firm Fixed Price',
np.where(contract_total['subaward_description'] == 'Cost Plus Fixed Fee', 'Cost Plus Fixed Fee', 'Other'))
contract_timeline
subaward_number | subawardee_duns | start_date | end_date | subaward_action_year | subaward_action_month | amount | year_month | |
---|---|---|---|---|---|---|---|---|
0 | 373883390 | 826534286 | 2017-04-27 | 2017-04-27 | 2017 | 4 | 499468.0 | 2017-4 |
1 | 4201307555 | 43570142 | 2016-06-07 | 2016-06-07 | 2016 | 6 | 82290.0 | 2016-6 |
2 | 4201363150 | 780680976 | 2016-08-30 | 2016-09-06 | 2016 | 8 | 428805895.0 | 2016-8 |
3 | 4201363150 | 780680976 | 2016-08-30 | 2016-09-06 | 2016 | 9 | 123145816.0 | 2016-9 |
4 | 4201399909 | 803209782 | 2016-10-27 | 2016-10-27 | 2016 | 10 | 3903690.0 | 2016-10 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
149 | 4202358297 | 45102274 | 2021-01-20 | 2021-01-20 | 2021 | 1 | 95573.0 | 2021-1 |
150 | 4202371214 | 788104334 | 2021-02-11 | 2021-02-11 | 2021 | 2 | 86583.0 | 2021-2 |
151 | 4202430353 | 16426298 | 2021-05-20 | 2021-05-20 | 2021 | 5 | 31004.0 | 2021-5 |
152 | 4202441583 | 45102274 | 2021-06-09 | 2021-06-09 | 2021 | 6 | 39096.0 | 2021-6 |
153 | AMS: ADV PROGRAMS | 2742497 | 2018-01-15 | 2018-01-15 | 2018 | 1 | 584200.0 | 2018-1 |
154 rows × 8 columns
contract_timeline = contract_timeline.sort_values('start_date')
contract_timeline
con_des, con_des_num = np.unique(contract_info['subaward_description'], return_counts = True)
con_amount = contract_info.groupby('subaward_description').apply(lambda df: df['subaward_amount'].sum()).reset_index()
con_amount.columns = ['description', 'amount']
con_count = pd.DataFrame()
con_count['description'] = con_des
con_count['count'] = con_des_num
con_des_info = pd.merge(con_count, con_amount, how = 'left', on = 'description')
con_des_info['ave_amount'] = con_des_info['amount']/con_des_info['count']
fig = px.scatter(con_des_info, x = "ave_amount", y = 'count',
size = "ave_amount", color = 'ave_amount',
hover_name = "description", log_x = True)
fig.show()
token_score = pd.DataFrame(columns = con_des, index = con_des)
for i in range(len(token_score)):
for j in range(len(token_score)):
token_score.iloc[i,j] = fuzz.token_sort_ratio(token_score.index[i] ,token_score.columns[j])
fig = px.imshow(token_score, text_auto=True, width=1000, height=1000)
fig.update_yaxes(showticklabels=False)
fig.update_xaxes(showticklabels=False)
fig.show()