# Milestones shown on the timeline, as (label, year) pairs listed from the
# earliest to the latest year (negative years presumably mean BCE).
events = [
    ("Control of Fire", -400000),
    ("Modern Humans", -300000),
    ("Farming", -12000),
    ("Written Records", -3200),
    ("Printing Press", 1450),
    ("Scientific Revolution", 1600),
    ("Statistical Thinking", 1750),
    ("Industrial Revolution", 1780),
    ("Digital Computers", 1940),
    ("Internet & Web", 1990),
    ("Mobile & Social Era", 2000),
    ("Today", 2025),
]

# Draw a gray horizontal line at y = 1 spanning the first to the last year.
plt.figure(figsize=(12, 2))
first_year = events[0][1]
last_year = events[-1][1]
plt.hlines(1, first_year, last_year, color='gray')

# One blue tick per event, with its label above the line and its year below.
for name, when in events:
    plt.plot(when, 1, marker='|', color='blue')
    plt.text(when, 1.05, name, rotation=45)
    plt.text(when, 0.9, str(when), rotation=45)

# Remove the axis ticks and the frame so only the timeline itself is drawn.
plt.yticks([])
plt.xticks([])
plt.box(False)


# Same kind of timeline, starting at "Written Records": the entries
# "Control of Fire", "Modern Humans" and "Farming" are left out of this list.
events = [
    ("Written Records", -3200),
    ("Printing Press", 1450),
    ("Scientific Revolution", 1600),
    ("Statistical Thinking", 1750),
    ("Industrial Revolution", 1780),
    ("Digital Computers", 1940),
    ("Internet & Web", 1990),
    ("Mobile & Social Era", 2000),
    ("Today", 2025),
]

# Draw a gray horizontal line at y = 1 spanning the first to the last year.
plt.figure(figsize=(12, 2))
first_year = events[0][1]
last_year = events[-1][1]
plt.hlines(1, first_year, last_year, color='gray')

# One blue tick per event, with its label above the line and its year below.
for name, when in events:
    plt.plot(when, 1, marker='|', color='blue')
    plt.text(when, 1.05, name, rotation=45)
    plt.text(when, 0.91, str(when), rotation=45)

# Remove the axis ticks and the frame so only the timeline itself is drawn.
plt.yticks([])
plt.xticks([])
plt.box(False)


# Same kind of timeline, starting at "Printing Press": the entries
# "Control of Fire", "Modern Humans", "Farming" and "Written Records" are
# left out of this list.
events = [
    ("Printing Press", 1450),
    ("Scientific Revolution", 1600),
    ("Statistical Thinking", 1750),
    ("Industrial Revolution", 1780),
    ("Digital Computers", 1940),
    ("Internet & Web", 1990),
    ("Mobile & Social Era", 2000),
    ("Today", 2025),
]

# Draw a gray horizontal line at y = 1 spanning the first to the last year.
plt.figure(figsize=(12, 2))
first_year = events[0][1]
last_year = events[-1][1]
plt.hlines(1, first_year, last_year, color='gray')

# One blue tick per event, with its label above the line and its year below.
for name, when in events:
    plt.plot(when, 1, marker='|', color='blue')
    plt.text(when, 1.05, name, rotation=45)
    plt.text(when, 0.91, str(when), rotation=45)

# Remove the axis ticks and the frame so only the timeline itself is drawn.
plt.yticks([])
plt.xticks([])
plt.box(False)


# Load the posts XML file into a DataFrame and show the dtype of each column.
df = pd.read_xml('data/languagelearning/posts.xml')
df.dtypes
# Which of these columns are relevant to our question?

Id                         int64
PostTypeId                 int64
CreationDate              object
Score                      int64
ViewCount                float64
Body                      object
OwnerUserId              float64
LastEditorUserId         float64
LastEditDate              object
LastActivityDate          object
Title                     object
Tags                      object
AnswerCount              float64
CommentCount               int64
ContentLicense            object
ClosedDate                object
AcceptedAnswerId         float64
ParentId                 float64
LastEditorDisplayName     object
OwnerDisplayName          object
FavoriteCount            float64
CommunityOwnedDate        object
dtype: object


# Columns that are not needed for the analysis; dropping them leaves only the
# post creation timestamp (see the head() output).
excess_columns = [
    'Id', 'PostTypeId', 'Score', 'ViewCount', 'Body',
    'OwnerUserId', 'LastEditorUserId', 'LastEditDate', 'LastActivityDate',
    'Title', 'Tags', 'AnswerCount', 'CommentCount', 'ContentLicense',
    'ClosedDate', 'AcceptedAnswerId', 'ParentId', 'LastEditorDisplayName',
    'OwnerDisplayName', 'FavoriteCount', 'CommunityOwnedDate',
]
df = df.drop(columns=excess_columns)
df.head()


# Convert the CreationDate column (read in as text) to pandas datetimes,
# then preview the first rows to check the result.
df['CreationDate'] = pd.to_datetime(df['CreationDate'])
df.head()


# Number of posts per month: map every creation timestamp to the timestamp
# of its month, then count the rows per month in chronological order.
creation_month = df['CreationDate'].dt.to_period('M')
df['year_month'] = creation_month.dt.to_timestamp()
volume_per_month = df['year_month'].value_counts().sort_index()

# Tick locations and labels for the date axis (the x axis when
# volume_per_month.index is plotted horizontally): one tick per calendar
# year, placed at the first month of that year that appears in the data.
first_month_by_year = {}
for month_start in volume_per_month.index:
    # The index is sorted, so the first month seen for a year is the earliest.
    first_month_by_year.setdefault(month_start.year, month_start)
tick_positions = list(first_month_by_year.values())
tick_labels = [str(year) for year in first_month_by_year]
volume_per_month

2016-04-01    474
2016-05-01     90
2016-06-01    203
2016-07-01     73
2016-08-01    168
             ... 
2023-11-01     18
2023-12-01     10
2024-01-01     15
2024-02-01     11
2024-03-01     10
Name: year_month, Length: 96, dtype: int64


# Black line chart of the monthly post counts over the whole period.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    volume_per_month.index,
    volume_per_month.values,
    marker='',
    linestyle='-',
    color='black',
)
ax.set(
    title='Volume of Posts per Month on Language Learning Stack Exchange',
    xlabel='Date',
    ylabel='Count',
);
# Discussion prompt: What do you notice about this graph?


# Skip the first four monthly entries and redraw the same line chart.
# Note: this rebinds volume_per_month, so running the cell again trims
# another four entries.
volume_per_month = volume_per_month.iloc[4:]
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    volume_per_month.index,
    volume_per_month.values,
    marker='',
    linestyle='-',
    color='black',
)
ax.set(
    title='Volume of Posts per Month on Language Learning Stack Exchange',
    xlabel='Date',
    ylabel='Count',
);
# Discussion prompts:
# What do we see in this graph? How do we interpret it?
# Are the number of posts increasing or decreasing?
# Do you see any patterns?


# Load the Twitter user table (this rebinds df, replacing the posts data),
# keep only rows whose `verified` flag is False, and draw a random sample of
# 1000 rows; random_state=1 makes the sample the same on every run.
# NOTE(review): the relative path points outside this project's directory —
# confirm it resolves on the machine running the notebook.
df = pd.read_csv('../../Research/tj_data/data/twitter_user.csv')
df = df[df['verified'] == False].sample(1000, random_state=1)
df.head()


# Bar chart of how many sampled accounts use each language; the bars follow
# the order returned by value_counts().
plt.figure(figsize=(12, 4))
langs_by_frequency = df['lang'].value_counts().index
sns.countplot(
    data=df,
    x='lang',
    order=langs_by_frequency,
    color='white',
    edgecolor='black',
)

<AxesSubplot: xlabel='lang', ylabel='count'>


# Histogram of follower counts over their full range.
plt.figure(figsize=(12, 4))
sns.histplot(
    data=df,
    x='followers_count',
    color='white',
    edgecolor='black',
    linewidth=1,
);


# Same histogram of follower counts, with the x axis limited to 0-1500.
plt.figure(figsize=(12, 4))
sns.histplot(
    data=df,
    x='followers_count',
    color='white',
    edgecolor='black',
    linewidth=1,
)
plt.xlim(left=0, right=1500);


# Horizontal box plot of follower counts over their full range.
plt.figure(figsize=(12, 4))
sns.boxplot(data=df, x='followers_count', color='white')

<AxesSubplot: xlabel='followers_count'>


# Same box plot of follower counts, with the x axis limited to 0-1500.
plt.figure(figsize=(12, 4))
sns.boxplot(
    data=df,
    x='followers_count',
    color='white',
)
plt.xlim(left=0, right=1500);


# Scatter plot of followers_count (x) against friends_count (y), one black
# point per sampled account.
plt.figure(figsize=(8, 4))
sns.scatterplot(data=df, x='followers_count', y='friends_count', color='black');


# Flag accounts with more than 900,000 followers and show up to five of them.
# The comparison is done on the whole column at once instead of calling a
# Python lambda for every row; the result is the same boolean column, and it
# is used directly as the row filter (no `== True` needed).
df['popular'] = df['followers_count'] > 900000
df[df['popular']].head(5)


# Flag accounts that follow more than 100,000 others and show up to five of
# them. This assigns to the `popular` column, overwriting any value it held.
# The comparison is done on the whole column at once instead of calling a
# Python lambda for every row; the boolean column is used directly as the
# row filter (no `== True` needed).
df['popular'] = df['friends_count'] > 100000
df[df['popular']].head(5)


# Followers vs. friends scatter plot, with both axes limited to 0-6000.
plt.figure(figsize=(4, 4))
sns.scatterplot(
    data=df,
    x='followers_count',
    y='friends_count',
    color='black',
)
plt.xlim(0, 6000)
plt.ylim(0, 6000);


# Same zoomed scatter plot (axes limited to 0-6000), with the points coloured
# by the value of the default_profile column.
plt.figure(figsize=(4, 4))
sns.scatterplot(
    data=df,
    x='followers_count',
    y='friends_count',
    hue='default_profile',
)
plt.xlim(0, 6000)
plt.ylim(0, 6000);


# Basic graph properties: node count, edge count, mean degree and density.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
# Sum of all node degrees, taken from the (node, degree) pairs of G.degree().
degree_total = sum(degree for _node, degree in G.degree())
print(f"Number of nodes: {n_nodes}")
print(f"Number of edges: {n_edges}")
print(f"Average degree: {degree_total / n_nodes:.2f}")
print(f"Density: {nx.density(G):.3f}")

Number of nodes: 25
Number of edges: 69
Average degree: 5.52
Density: 0.230


# Degree centrality: who has the most connections?
# Compute one score per node, rank the nodes from highest to lowest score,
# and print the five top-ranked nodes with their scores (3 decimals).
deg_cent = nx.degree_centrality(G)
# (node, score) pairs, highest score first; sorted() is stable, so nodes with
# equal scores keep the order in which deg_cent lists them.
sorted_deg = sorted(deg_cent.items(), key=lambda x: x[1], reverse=True)
print("Top 5 nodes by degree centrality:")
for node, val in sorted_deg[:5]:
    print(f"{node}: {val:.3f}")

Top 5 nodes by degree centrality:
u2: 0.375
u13: 0.333
u12: 0.292
u20: 0.292
u6: 0.250


# Betweenness centrality: who sits between groups?
# Compute one score per node, rank the nodes from highest to lowest score,
# and print the five top-ranked nodes with their scores (3 decimals).
bet_cent = nx.betweenness_centrality(G)
# (node, score) pairs, highest score first; sorted() is stable, so nodes with
# equal scores keep the order in which bet_cent lists them.
sorted_bet = sorted(bet_cent.items(), key=lambda x: x[1], reverse=True)
print("Top 5 nodes by betweenness centrality:")
for node, val in sorted_bet[:5]:
    print(f"{node}: {val:.3f}")

Top 5 nodes by betweenness centrality:
u2: 0.144
u12: 0.087
u19: 0.080
u15: 0.077
u9: 0.065


# Clustering coefficient: how tight are local friend groups?
# Print the graph-wide average clustering coefficient (3 decimals).
clust = nx.clustering(G)  # result of nx.clustering; not used again in this cell
avg_clust = nx.average_clustering(G)
print(f"Average clustering coefficient: {avg_clust:.3f}")

Average clustering coefficient: 0.160

	CreationDate
0	2016-04-05T17:20:52.593
1	2016-04-05T17:24:31.210
2	2016-04-05T17:28:48.063
3	2016-04-05T17:29:12.303
4	2016-04-05T17:30:36.530

	CreationDate
0	2016-04-05 17:20:52.593
1	2016-04-05 17:24:31.210
2	2016-04-05 17:28:48.063
3	2016-04-05 17:29:12.303
4	2016-04-05 17:30:36.530

Computers Do	Humans Do
Process huge amounts of data	Notice patterns, anomalies, stories
Follow clear instructions	Ask questions, challenge assumptions
Repeat tasks perfectly	Interpret results creatively
Find correlations	Understand causes and meaning

	id	created_at	lang	name	screen_name	location	description	url	followers_count	friends_count	favourites_count	listed_count	statuses_count	protected	verified	default_profile	default_profile_image	access
20984	3018707070	2015-02-13 18:05:46	en	ClashBot	ClashBotOrg	NaN	ClashBot Plays Clash of Clans for you. ClashBot automatically stays online, collects, trains and farms millions every day.	http://Boostbot.org/forums/	2190	15	70	2	208	False	False	False	False	2017-04-29 08:24:35
101241	2434206150	2014-04-08 20:25:49	en	Mid-Market Finance Expert	SMBloanguru	Austin, TX	Since 1994 assisted hundreds of mid-market B2B businesses with growth capital. Capitalist and entrepreneurial oriented.	NaN	822	2297	10726	39	9308	False	False	True	False	2018-09-21 18:09:18
17024	4286952677	2015-11-26 16:12:46	ru	Димыч #Одесса 🇺🇦	Kovboentij	NaN	#РабыДляНацгвардии #СисекМногоНеБывает #Партия_сисек #Партія_грудей	NaN	1460	1279	8920	17	25926	False	False	True	False	2017-01-01 23:00:45
4654	203244192	2010-10-15 21:15:25	en	Alex	111its	NaN	Rocket ship builder. Спокойно. 21ый век только начался. Жаар	NaN	52	178	6473	0	3468	False	False	True	False	2014-11-13 23:29:46
139881	1098212514	2013-01-17 13:53:52	NaN	Asad	a5adali	England, United Kingdom	A Humble Muslim, Proud Pakistani. speak and support truth.	NaN	61	105	307	0	351	False	False	True	False	2019-05-30 11:42:49

A Brief Introduction to Data Science¶

Prof. Dr. Rebekah Overdorf¶

Chair for Security and Trustworthiness of Online Information¶

Personal Introduction¶

People use the Internet to do Bad Things¶

A) DIY Attacks¶

Authorship Attribution¶

Privacy Leakage¶

Attacks as Contestation¶

Personal Introduction¶

People use the Internet to do Bad Things¶

A) DIY Attacks¶

B) Measuring Real Attacks¶

Social Media Bots¶

Deception¶

Datasets¶

Why Data Science?¶

Why Data Science... now?¶

Important Data Related Events in History¶

What is Data Science all about?¶

What is Data Science all about?¶

What is Data?¶

Primary Types of Data¶

1. Quantitative (Numeric) Data¶

a) Discrete:¶

b) Continuous:¶

Primary Types of Data¶

2. Qualitative (Categorical) Data¶

a) Nominal:¶

b) Ordinal:¶

Think, Pair, Share¶

Think, Pair, Share¶

More Types of Data¶

Data Structures¶

Data Wrangling¶

What Is Data Wrangling?¶

Data Cleaning¶

Drop Excess Columns¶

Fix the timestamps¶

Important Point: Data Science Is About More Than Code¶

Creativity in Data Science¶

Data Analysis¶

Descriptive Statistics¶

Descriptive Statistics¶

Exploring Data Visually¶

1 Variable¶

2 Variables¶

Network Data¶

Real-world Example¶

Real-world Example¶

Nodes don't have to be people¶

Wrap Up¶