import matplotlib.pyplot as plt

events = [
("Control of Fire", -400000),
("Modern Humans", -300000),
('Farming', -12000),
("Written Records", -3200),
("Printing Press", 1450),
("Scientific Revolution", 1600),
("Statistical Thinking", 1750),
("Industrial Revolution", 1780),
("Digital Computers", 1940),
("Internet & Web", 1990),
("Mobile & Social Era", 2000),
("Today", 2025),
]
plt.figure(figsize=(12, 2))
plt.hlines(1, events[0][1], events[-1][1], color='gray')
for label, year in events:
    plt.plot(year, 1, marker='|', color='blue')
    plt.text(year, 1.05, label, rotation=45)
    plt.text(year, 0.9, str(year), rotation=45)
plt.yticks([]), plt.xticks([]), plt.box(False);
events = [
# ("Control of Fire", -400000),
# ("Modern Humans", -300000),
# ('Farming', -12000),
("Written Records", -3200),
("Printing Press", 1450),
("Scientific Revolution", 1600),
("Statistical Thinking", 1750),
("Industrial Revolution", 1780),
("Digital Computers", 1940),
("Internet & Web", 1990),
("Mobile & Social Era", 2000),
("Today", 2025),
]
plt.figure(figsize=(12, 2))
plt.hlines(1, events[0][1], events[-1][1], color='gray')
for label, year in events:
    plt.plot(year, 1, marker='|', color='blue')
    plt.text(year, 1.05, label, rotation=45)
    plt.text(year, 0.91, str(year), rotation=45)
plt.yticks([]), plt.xticks([]), plt.box(False);
events = [
# ("Control of Fire", -400000),
# ("Modern Humans", -300000),
# ('Farming', -12000),
# ("Written Records", -3200),
("Printing Press", 1450),
("Scientific Revolution", 1600),
("Statistical Thinking", 1750),
("Industrial Revolution", 1780),
("Digital Computers", 1940),
("Internet & Web", 1990),
("Mobile & Social Era", 2000),
("Today", 2025),
]
plt.figure(figsize=(12, 2))
plt.hlines(1, events[0][1], events[-1][1], color='gray')
for label, year in events:
    plt.plot(year, 1, marker='|', color='blue')
    plt.text(year, 1.05, label, rotation=45)
    plt.text(year, 0.91, str(year), rotation=45)
plt.yticks([]), plt.xticks([]), plt.box(False);
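The three cells above differ only in their events list; the repeated plotting code could be factored into a helper (a sketch built from the same calls used above):

def plot_timeline(events):
    plt.figure(figsize=(12, 2))
    plt.hlines(1, events[0][1], events[-1][1], color='gray')
    for label, year in events:
        plt.plot(year, 1, marker='|', color='blue')
        plt.text(year, 1.05, label, rotation=45)
        plt.text(year, 0.9, str(year), rotation=45)
    plt.yticks([]), plt.xticks([]), plt.box(False)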
Data Wrangling
Using Data
Talk to your neighbor. Come up with 2 examples of Quantitative Data and 2 examples of Qualitative Data.
Talk to your neighbor. Come up with at least one type of data that does not neatly fit into these categories.
Turning messy data into usable data
Goal: make data accurate, consistent, and useful
It involves loading raw data, keeping only the relevant columns, converting values to appropriate types, and removing incomplete or misleading records, as sketched below.
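A minimal sketch of these steps, using a hypothetical messy.csv (the file name and columns are made up for illustration):

import pandas as pd

raw = pd.read_csv('messy.csv')                            # load raw data
clean = raw[['timestamp', 'value']].copy()                # keep relevant columns
clean = clean.dropna()                                    # drop incomplete rows
clean['timestamp'] = pd.to_datetime(clean['timestamp'])   # convert to proper types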
Simple Research Question: has the volume of posts on the Language Learning Stack Exchange changed over time?
import pandas as pd

df = pd.read_xml('data/languagelearning/posts.xml')
df.dtypes
# Which of these columns are relevant to our question?
Id                        int64
PostTypeId                int64
CreationDate             object
Score                     int64
ViewCount               float64
Body                     object
OwnerUserId             float64
LastEditorUserId        float64
LastEditDate             object
LastActivityDate         object
Title                    object
Tags                     object
AnswerCount             float64
CommentCount              int64
ContentLicense           object
ClosedDate               object
AcceptedAnswerId        float64
ParentId                float64
LastEditorDisplayName    object
OwnerDisplayName         object
FavoriteCount           float64
CommunityOwnedDate       object
dtype: object
df = df.drop(['Id', 'PostTypeId', 'Score', 'ViewCount', 'Body',
'OwnerUserId', 'LastEditorUserId', 'LastEditDate', 'LastActivityDate',
'Title', 'Tags', 'AnswerCount', 'CommentCount', 'ContentLicense',
'ClosedDate', 'AcceptedAnswerId', 'ParentId', 'LastEditorDisplayName',
'OwnerDisplayName', 'FavoriteCount', 'CommunityOwnedDate'], axis=1)
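An equivalent, more concise way to keep only the one column we need (assuming nothing else will be used):

df = df[['CreationDate']]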
df.head()
 | CreationDate
---|---
0 | 2016-04-05T17:20:52.593
1 | 2016-04-05T17:24:31.210
2 | 2016-04-05T17:28:48.063
3 | 2016-04-05T17:29:12.303
4 | 2016-04-05T17:30:36.530
df['CreationDate'] = pd.to_datetime(df['CreationDate'])
df.head()
 | CreationDate
---|---
0 | 2016-04-05 17:20:52.593
1 | 2016-04-05 17:24:31.210
2 | 2016-04-05 17:28:48.063
3 | 2016-04-05 17:29:12.303
4 | 2016-04-05 17:30:36.530
# This snippet calculates number of posts per month
df['year_month'] = df['CreationDate'].dt.to_period('M').dt.to_timestamp()
volume_per_month = df['year_month'].value_counts().sort_index()
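# An equivalent one-liner (a sketch; note that resample labels each bin by
# month *end*, while to_period('M').to_timestamp() gives month starts):
# volume_per_month = df.resample('M', on='CreationDate').size()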
# This snippet calculates the x tick locations for the graph
tick_positions = []
tick_labels = []
previous_year = None
for date in volume_per_month.index:
    year = date.year
    if year != previous_year:
        tick_positions.append(date)
        tick_labels.append(str(year))
    previous_year = year
volume_per_month
2016-04-01    474
2016-05-01     90
2016-06-01    203
2016-07-01     73
2016-08-01    168
             ...
2023-11-01     18
2023-12-01     10
2024-01-01     15
2024-02-01     11
2024-03-01     10
Name: year_month, Length: 96, dtype: int64
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(volume_per_month.index, volume_per_month.values, marker='', linestyle='-', color='black')
ax.set_title('Volume of Posts per Month on Language Learning Stack Exchange')
ax.set_xlabel('Date')
ax.set_ylabel('Count');
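The year ticks computed earlier can be applied to this plot (a sketch reusing the same ax, tick_positions, and tick_labels):

ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)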
# What do you notice about this graph?
# Drop the first four months; the early spike skews the y-axis scale
volume_per_month = volume_per_month.iloc[4:]
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(volume_per_month.index, volume_per_month.values, marker='', linestyle='-', color='black')
ax.set_title('Volume of Posts per Month on Language Learning Stack Exchange')
ax.set_xlabel('Date')
ax.set_ylabel('Count');
# What do we see in this graph? How do we interpret it?
# Are the number of posts increasing or decreasing?
# Do you see any patterns?
Computers Do | Humans Do |
---|---|
Process huge amounts of data | Notice patterns, anomalies, stories |
Follow clear instructions | Ask questions, challenge assumptions |
Repeat tasks perfectly | Interpret results creatively |
Find correlations | Understand causes and meaning |
df = pd.read_csv('../../Research/tj_data/data/twitter_user.csv')
# Keep unverified accounts and take a random sample of 1,000 for plotting
df = df[df['verified'] == False].sample(1000, random_state=1)
df.head()
 | id | created_at | lang | name | screen_name | location | description | url | followers_count | friends_count | favourites_count | listed_count | statuses_count | protected | verified | default_profile | default_profile_image | access
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
20984 | 3018707070 | 2015-02-13 18:05:46 | en | ClashBot | ClashBotOrg | NaN | ClashBot Plays Clash of Clans for you. ClashBot automatically stays online, collects, trains and farms millions every day. | http://Boostbot.org/forums/ | 2190 | 15 | 70 | 2 | 208 | False | False | False | False | 2017-04-29 08:24:35 |
101241 | 2434206150 | 2014-04-08 20:25:49 | en | Mid-Market Finance Expert | SMBloanguru | Austin, TX | Since 1994 assisted hundreds of mid-market B2B businesses with growth capital. Capitalist and entrepreneurial oriented. | NaN | 822 | 2297 | 10726 | 39 | 9308 | False | False | True | False | 2018-09-21 18:09:18 |
17024 | 4286952677 | 2015-11-26 16:12:46 | ru | Димыч #Одесса 🇺🇦 | Kovboentij | NaN | #РабыДляНацгвардии #СисекМногоНеБывает #Партия_сисек #Партія_грудей | NaN | 1460 | 1279 | 8920 | 17 | 25926 | False | False | True | False | 2017-01-01 23:00:45 |
4654 | 203244192 | 2010-10-15 21:15:25 | en | Alex | 111its | NaN | Rocket ship builder. Спокойно. 21ый век только начался. Жаар | NaN | 52 | 178 | 6473 | 0 | 3468 | False | False | True | False | 2014-11-13 23:29:46 |
139881 | 1098212514 | 2013-01-17 13:53:52 | NaN | Asad | a5adali | England, United Kingdom | A Humble Muslim, Proud Pakistani. speak and support truth. | NaN | 61 | 105 | 307 | 0 | 351 | False | False | True | False | 2019-05-30 11:42:49 |
import seaborn as sns

plt.figure(figsize=(12, 4))
sns.countplot(x='lang', data=df, color='white', edgecolor='black', order=df['lang'].value_counts().index);
plt.figure(figsize=(12, 4))
sns.histplot(data=df, x='followers_count', color='white', edgecolor='black', linewidth=1);
plt.figure(figsize=(12, 4))
sns.histplot(data=df, x='followers_count', color='white', edgecolor='black', linewidth=1)
plt.xlim(0, 1500);
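Follower counts are heavily right-skewed, so clipping the x-axis hides the tail. A log scale is another way to see the whole range (a sketch using seaborn's log_scale option; zero-follower accounts are filtered out first, since log(0) is undefined):

plt.figure(figsize=(12, 4))
sns.histplot(data=df[df['followers_count'] > 0], x='followers_count',
             color='white', edgecolor='black', linewidth=1, log_scale=True);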
plt.figure(figsize=(12, 4))
sns.boxplot(data=df, x='followers_count', color='white');
plt.figure(figsize=(12, 4))
sns.boxplot(data=df, x='followers_count', color='white')
plt.xlim(0, 1500);
plt.figure(figsize=(8, 4))
sns.scatterplot(data=df, x='followers_count', y='friends_count', color='black');
# Flag accounts with very large audiences (vectorized comparison)
df['popular'] = df['followers_count'] > 900000
df[df['popular']].head(5)
 | id | created_at | lang | name | screen_name | location | description | url | followers_count | friends_count | favourites_count | listed_count | statuses_count | protected | verified | default_profile | default_profile_image | access | popular
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
68321 | 573459961 | 2012-05-07 09:11:38 | en | Psychology | psychologicaI | Science of the mind | Human Psychology -- Social Behaviorism -- Human Development. | NaN | 998542 | 374 | 131 | 2175 | 64673 | False | False | False | False | 2018-02-10 16:33:05 | True |
# Reuse the flag for accounts that follow an unusually large number of others
df['popular'] = df['friends_count'] > 100000
df[df['popular']].head(5)
 | id | created_at | lang | name | screen_name | location | description | url | followers_count | friends_count | favourites_count | listed_count | statuses_count | protected | verified | default_profile | default_profile_image | access | popular
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
133018 | 740483180 | 2012-08-06 12:10:19 | en | ⭐️⭐️⭐️ Suzy says release EVERYTHING ⭐️⭐️⭐️ | suzydymna | Twitter JAIL | Democrats have forgotten, after Hubris always comes Nemesis, in their case Nemesis has a name, it is... Donald J. Trump. | https://www.donaldjtrump.com/ | 139521 | 126176 | 23401 | 299 | 30621 | False | False | True | False | 2019-04-17 08:38:02 | True |
plt.figure(figsize=(4, 4))
sns.scatterplot(data=df, x='followers_count', y='friends_count', color='black')
plt.xlim(0, 6000), plt.ylim(0, 6000);
plt.figure(figsize=(4, 4))
sns.scatterplot(data=df, x='followers_count', y='friends_count', hue='default_profile')
plt.xlim(0, 6000), plt.ylim(0, 6000);
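The next cells analyze a small social graph G. Its construction is not shown in this excerpt; a minimal sketch of how such a graph might be built from an edge list (the pairs below are entirely hypothetical):

import networkx as nx

# each pair is a tie between two users, e.g. a follow relationship
edges = [('u1', 'u2'), ('u2', 'u12'), ('u2', 'u13'), ('u12', 'u20')]
G = nx.Graph()
G.add_edges_from(edges)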
# Basic graph properties
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print(f"Average degree: {sum(dict(G.degree()).values()) / G.number_of_nodes():.2f}")
print(f"Density: {nx.density(G):.3f}")
Number of nodes: 25
Number of edges: 69
Average degree: 5.52
Density: 0.230
# Degree centrality: who has the most connections?
deg_cent = nx.degree_centrality(G)
sorted_deg = sorted(deg_cent.items(), key=lambda x: x[1], reverse=True)
print("Top 5 nodes by degree centrality:")
for node, val in sorted_deg[:5]:
    print(f"{node}: {val:.3f}")
Top 5 nodes by degree centrality:
u2: 0.375
u13: 0.333
u12: 0.292
u20: 0.292
u6: 0.250
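networkx normalizes degree centrality by the maximum possible degree, n - 1. With n = 25 nodes, u2's score of 0.375 corresponds to 0.375 * 24 = 9 connections. A quick check (assuming the same G as above):

print(G.degree('u2'))                  # expected: 9
print(9 / (G.number_of_nodes() - 1))   # 0.375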
# Betweenness centrality: who sits between groups?
bet_cent = nx.betweenness_centrality(G)
sorted_bet = sorted(bet_cent.items(), key=lambda x: x[1], reverse=True)
print("Top 5 nodes by betweenness centrality:")
for node, val in sorted_bet[:5]:
    print(f"{node}: {val:.3f}")
Top 5 nodes by betweenness centrality:
u2: 0.144
u12: 0.087
u19: 0.080
u15: 0.077
u9: 0.065
# Clustering coefficient: how tight are local friend groups?
clust = nx.clustering(G)
avg_clust = nx.average_clustering(G)
print(f"Average clustering coefficient: {avg_clust:.3f}")
Average clustering coefficient: 0.160
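A single node's clustering coefficient is the fraction of pairs of its neighbors that are themselves connected: C(v) = 2 * triangles(v) / (deg(v) * (deg(v) - 1)). This can be verified directly (assuming the same undirected G):

node = 'u2'
k = G.degree(node)
tri = nx.triangles(G, node)      # number of triangles through this node
print(2 * tri / (k * (k - 1)))   # manual formula
print(nx.clustering(G, node))    # networkx's value; should match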