Basics of machine learning with text data: bag-of-words for linear regression
This tutorial was presented as a Jupyter notebook during a workshop at the University of Minnesota. You can view it here.
This tutorial covers the basics of training and evaluating a linear regression model, using both text and non-text features. We will cover the bag-of-words representation, building a vocabulary based on word frequency, and examining the weights learned by our model.
In this tutorial, we'll cover the basics of machine learning with text data by analyzing political tweets.
Using the ubiquitous Python packages numpy, pandas, matplotlib, and scikit-learn, we'll train and improve regression models for predicting the number of likes received by a tweet.
To make this a practical example, no computing or pre-processing occurs outside of this notebook: if the correct Python packages are installed, you should be able to run this on your own (download from GitHub and load using Jupyter) or using Google Colab.
This tutorial covers material in chapters 4-6 of Speech and Language Processing (3rd ed.) by Dan Jurafsky and James H. Martin. For those interested in deeper coverage and more theoretical background, I strongly recommend those chapters.
Downloading and reading the data
The first step is to download the data: a collection of US Political Tweets collected in June 2017 by Jason Baumgartner.
The wget command below should "just work" on Linux or Mac OS (if wget is installed). Otherwise, just download the data and run the notebook from wherever you downloaded the data to.
# this should work on Mac (if wget is installed) or Unix
!wget -O - --no-check-certificate https://files.pushshift.io/twitter/US_PoliticalTweets.tar.gz | tar -xz
--2021-11-05 14:09:55-- https://files.pushshift.io/twitter/US_PoliticalTweets.tar.gz Resolving files.pushshift.io (files.pushshift.io)... 104.21.55.251, 172.67.174.211, 2606:4700:3033::6815:37fb, ... Connecting to files.pushshift.io (files.pushshift.io)|104.21.55.251|:443... connected. WARNING: cannot verify files.pushshift.io's certificate, issued by ‘CN=Cloudflare Inc ECC CA-3,O=Cloudflare\\, Inc.,C=US’: Unable to locally verify the issuer's authority. HTTP request sent, awaiting response... 200 OK Length: 240797053 (230M) [application/octet-stream] Saving to: ‘STDOUT’ - 100%[===================>] 229.64M 10.9MB/s in 23s 2021-11-05 14:10:19 (10.1 MB/s) - written to stdout [240797053/240797053]
!du -h *.json
1.6G tweets.json 916K users.json
# numpy and pandas for data-wrangling
import numpy as np
import pandas as pd
# matplotlib for plotting
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 100
We'll start by looking at the smaller of the two files: the user data. We can use the json library to process the data and build a pandas DataFrame.
import json
with open('users.json', 'r') as infile:
    users = [json.loads(line) for line in infile.readlines()]
user_df = pd.DataFrame(users)
len(user_df)
548
user_df.sort_values(by='followers_count', ascending=False)[['screen_name', 'followers_count']].head(10)
|  | screen_name | followers_count |
|---|---|---|
| 150 | realDonaldTrump | 31712585 |
| 147 | POTUS | 18545354 |
| 509 | SenSanders | 5072538 |
| 31 | CoryBooker | 3094413 |
| 118 | marcorubio | 2554822 |
| 520 | SenWarren | 2412087 |
| 501 | SenJohnMcCain | 2274034 |
| 131 | NancyPelosi | 1126205 |
| 90 | JerryBrownGov | 1119275 |
| 515 | SenTedCruz | 960413 |
The largest accounts by follower count look like what we would expect for 2017: Donald Trump's personal and official Twitter accounts, followed by Bernie Sanders and Cory Booker.
We can use Jupyter's built-in bash commands to take a peek at the tweets data before we load it in.
!head -n 1 tweets.json | python -m json.tool
{ "contributors": null, "coordinates": null, "created_at": 1217870931, "display_text_range": [ 0, 74 ], "entities": { "hashtags": [], "symbols": [], "urls": [], "user_mentions": [] }, "favorite_count": 0, "favorited": false, "geo": null, "id": 877418565, "id_str": "877418565", "in_reply_to_screen_name": null, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "is_quote_status": false, "lang": "en", "place": null, "retweet_count": 0, "retweeted": false, "screen_name": "JohnBoozman", "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>", "text": "On a conference call about a weekend trip to Iraq to visit Arkansas troops", "truncated": false, "user_id": 5558312 }
Documentation for keys in the tweet data is provided by Twitter. See here. It looks like we only care about the actual tweet text (text), the screen name of the tweeting user (screen_name), and the number of likes (favorite_count) and retweets (retweet_count).
with open('tweets.json', 'r') as infile:
    tweets = []
    for line in infile:
        tweet = json.loads(line)
        tweet = {
            'screen_name': tweet['screen_name'],
            'favorite_count': tweet['favorite_count'],
            'retweet_count': tweet['retweet_count'],
            'text': tweet['text'],
        }
        tweets.append(tweet)
tweet_df = pd.DataFrame(tweets)
len(tweet_df)
1243370
counts = tweet_df.screen_name.value_counts()
counts.head(10)
RepDonBeyer 3258 SenatorDurbin 3252 GovMattBevin 3250 MassGovernor 3250 onetoughnerd 3249 SenTomCotton 3249 GrahamBlog 3249 SenRonJohnson 3249 RepScottPeters 3248 GovMalloyOffice 3247 Name: screen_name, dtype: int64
The top accounts in the dataset seem to all have around 3,250 tweets, which feels a little unexpected. Let's make a figure to visualize the distribution of tweets by account name.
plt.hist(counts, bins=100)
plt.title(f"Distribution of {len(counts)} US politician's total tweets in our data")
plt.xlabel("Total tweets")
plt.ylabel("Number of politicians")
plt.annotate(f'{sum(counts >= 3200) / len(counts):.1%} of politicians have 3200+ tweets', (3200, 100), xytext=(2500, 100), arrowprops={'arrowstyle': 'simple'}, va='center', ha='right')
plt.show()
What's going on here?
The dataset description gives us a clue:
The tweets are complete up to the oldest date for each user. (In other words, the API allows going 3,200 back for any particular user, so the tweets will be complete from present (today) back to oldest date available. If a politician posts frequently (like Trump), it may only go back a year.)
Depending on our particular research question, that restriction may be something we can ignore or it may necessitate changing our data-collection method.
Machine learning basics: vectorizing tweets and training a model
In this tutorial, we'll be predicting the number of likes received by a tweet. We call the variable we're trying to predict ("# likes") the outcome or target.
To train and evaluate a linear regression model for this task, we need to do a few things:
- Create a training set: which tweets will we use to train the model? To decrease computational costs, we'll only use 100,000 (of 1.2 million total) tweets for training.
- Create a test set: we want a linear regression model that is capable of generalizing to tweets not seen in the training set. To evaluate generalization performance, we'll identify a test set of tweets that aren't in the training set. We'll use the model to predict number of likes on the test set so that we can compare the actual number of likes to our predicted number.
- Vectorize the tweets: machine learning models generally expect individual training data points to be represented as fixed-length, multi-dimensional vectors. We typically call each of the dimensions of these vectors a feature, predictor, or variable. We'll start by vectorizing our tweets into one-dimensional vector representations, then add additional dimensions to incorporate information from the tweets' text content.
We'll use pandas.DataFrame.sample to select a random 200,000 tweets to fill our train and test sets.
n_train = 100000
n_test = 100000
sampled_tweets = tweet_df.sample(n=n_train + n_test, random_state=0)
len(sampled_tweets)
200000
Our outcome is just the number of likes (favorite_count in our dataframe), which we'll store in a numpy array. Even though the number of likes is an integer, we'll use 32-bit floating point numbers because the optimization libraries we'll use to train our models expect floating-point values.
y = np.array(sampled_tweets.favorite_count, dtype='float32')
y.shape
(200000,)
We'll now create our vectors, stacking them in rows to create a single design matrix. In this case, we can see (by inspecting the "shape"/dimensions) that the resulting matrix X has 200,000 rows (for the 100,000 training tweets and the 100,000 testing tweets) and 1 column: the number of retweets.
Using this vector representation of our tweets means training a model to answer the question: given a certain number of retweets for a tweet, how many likes did that tweet receive?
X = np.array(sampled_tweets.retweet_count, dtype='float32').reshape((-1, 1))
X.shape
(200000, 1)
Let's look at the data matrices we've constructed.
plt.figure()
xs = X.reshape(-1)
ys = y
plt.hexbin(xs + 1, ys + 1, bins='log', xscale='log', yscale='log', mincnt=1)
plt.xlabel("X (# retweets)")
plt.ylabel("y (# likes)")
i = np.argmax(ys)
plt.annotate(f'{int(xs[i]):,} retweets\n{int(ys[i]):,} likes', (xs[i], ys[i]), xytext=(xs[i] * 10, 500), arrowprops={'arrowstyle': 'simple'}, va='center', ha='center')
print("Highest # of likes:")
print(sampled_tweets.iloc[i].text)
i = np.argmax(xs)
plt.annotate(f'{int(xs[i]):,} retweets\n{int(ys[i]):,} likes', (xs[i], 1), xytext=(xs[i], 5), arrowprops={'arrowstyle': 'simple'}, va='center', ha='right')
print("Highest # of retweets:")
print(sampled_tweets.iloc[i].text)
plt.show()
Highest # of likes: Peaceful protests are a hallmark of our democracy. Even if I don't always agree, I recognize the rights of people to express their views. Highest # of retweets: RT @carterjwm: HELP ME PLEASE. A MAN NEEDS HIS NUGGS https://t.co/4SrfHmEMo3
From the figure above, we can notice a few things about our data:
- Looking at the bottom row, there are tweets with many retweets but zero likes, which is surprising. The extreme outlier in the figure above is this tweet, retweeted by a politician in our dataset. If we look at the tweet on twitter, we can see two peculiarities: this tweet has nearly 1 million likes (not 0 as our data suggests) and this tweet has 3.2 million retweets (not 3.6 million as our data suggests). In a normal analysis, we would need to dig deeper: is this a problem with our understanding of the data or the API call? Did the number of retweets decrease since June 2017 organically (due to human users deleting their retweets) or inorganically (due to e.g. Twitter's spam account cleanup operations)? Note that visualizing the data is what allowed us to find this data hiccup. Always look at your data! For this analysis, we'll move forward with the dataset as-is, accepting that our results will be biased due to this issue.
Sidebar: What is causing this issue? The answer can be found by noticing that the data has text "RT @carterjwm: HELP ME PLEASE. A MAN NEEDS HIS NUGGS", where "RT" means "retweet". Does that mean that favorite_count doesn't include favorites on retweets? We can use pandas.crosstab to compare tweets whose text starts with "RT" to tweets with more than 0 likes.
is_rt = tweet_df.text.map(lambda text: text.startswith('RT')).rename("is_retweet?")
pd.crosstab(is_rt, (tweet_df.favorite_count > 0).rename(">0 likes?"))
|  | >0 likes? = False | >0 likes? = True |
|---|---|---|
| is_retweet? = False | 224654 | 762208 |
| is_retweet? = True | 252181 | 4327 |
We can see from the contingency table that not every retweet has zero likes: 4327 retweets have 1 or more likes. I believe this is due to "quote tweets"; investigating these quote tweets and deciding how to handle them in the analysis would be a great extension of this tutorial. (We're going to ignore them.) Now, back to our observations about the figure above.
- Number of retweets and number of likes seem to be positively correlated, as we might expect. If there was no correlation at all, then we would have no hope of modeling the relationship between the two.
- Both number of retweets and number of likes span many orders of magnitude.
Applied Linear Regression (4th ed.) by Sanford Weisberg helpfully includes the log rule:
If the values of a variable range over more than one order of magnitude and the variable is strictly positive, then replacing the variable by its logarithm is likely to be helpful.
Similarly Andrew Gelman exhorts us to "log transform your positive data". As our data includes zero counts, we'll use the $\log(x+1)$ transformation instead. (For more graceful alternatives to adding 1, see some discussion here.)
We'll transform both our predictor variable (number of retweets) and our outcome variable (number of likes).
# our data spans multiple orders of magnitude: we'll apply a log transformation
X = np.log(X + 1)
y = np.log(y + 1)
We already sampled the data randomly, so to separate our vectorized tweets and numeric outcome variables into training and test sets, we just need to take the first 100,000 rows as the training set and the next 100,000 rows as the test set.
X_train = X[:n_train]
X_test = X[n_train:]
X_train.shape, X_test.shape
((100000, 1), (100000, 1))
y_train = y[:n_train]
y_test = y[n_train:]
y_train.shape, y_test.shape
((100000,), (100000,))
To train a linear regression model, we'll use the scikit-learn package (referred to in code as sklearn). We'll generally gloss over the usage details in this tutorial, but the scikit-learn documentation is useful and high-quality.
def train_model(X_train, y_train):
    import sklearn.linear_model
    model = sklearn.linear_model.LinearRegression()
    model.fit(X_train, y_train)
    return model
model = train_model(X_train, y_train)
In two dimensions (with one outcome and one predictor), linear regression is equivalent to a "line of best fit": the trained model object returned by scikit-learn includes the intercept and slope for that line.
intercept = model.intercept_
slope = model.coef_[0]
intercept, slope
(0.36170352, 0.5771629)
Using that intercept and slope, we can plot that line. Each point along the line represents the model's prediction (y) given a particular number of retweets (x).
xs = X_test.reshape(-1)
ys = y_test
plt.hexbin(xs, ys, bins='log', mincnt=1)
plt.xlabel("X (log # retweets)")
plt.ylabel("y (log # likes)")
# sort a copy of the x values for drawing the fit line
# (sorting xs in place would also reorder X_test, since reshape returns a view)
line_xs = np.sort(xs)
fit_ys = intercept + slope * line_xs
plt.plot(line_xs, fit_ys, '--', linewidth=3, color='black', alpha=0.9)
plt.show()
Maybe not what we expected? Linear regression fits a model to minimize the distance between the predicted value and the actual value, averaged over all of the training data points. The tweets without any likes count just as much as the tweets in the main cluster, so on average the best fit is achieved by dragging down the slope.
The average distance between the predicted value and the actual value is also how we (typically) evaluate a linear regression model. To punish bigger mistakes more than small mistakes, we'll square the difference between the predicted and actual value before taking the mean. We implement this metric, Mean Squared Error, in the function below.
Note that we pass in the testing set to evaluate_model, using the model object to make predictions before computing Mean Squared Error.
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    # to compute Mean Squared Error, we first compute the absolute difference between
    # the true value and the predicted value
    absolute_errors = np.abs(y_test - y_pred)
    # then, we square the absolute differences and take the mean
    mean_squared_error = np.mean(absolute_errors ** 2)
    return mean_squared_error, absolute_errors, y_pred
mean_squared_error, absolute_errors, y_pred = evaluate_model(model, X_test, y_test)
mean_squared_error
3.7528915
Mean Squared Error is hard to interpret; is 3.75 good? We can develop our intuitions about the quality of the model by plotting the actual values vs the predicted values. An "oracle" model would predict the same as the actual value 100% of the time, which would result in a diagonal line. By looking at the plot below, we can see that we underpredict the true number of likes more than we overpredict the true number.
Further, if we round the prediction to the nearest integer, what percentage of the time is the model's prediction correct? As we can see in the figure below, our first model is correct about 10% of the time. We also compute the percentage of tweets in the test set that are within 5 likes, which we can interpret as "mostly correct".
As we incorporate text content into our model, we'll aim to decrease Mean Squared Error and increase the percentage of exactly correct predictions.
y_pred_actual = np.exp(y_pred) - 1
y_test_actual = np.exp(y_test) - 1
plt.scatter(y_test_actual, y_pred_actual, color='black', marker='.', alpha=0.2)
plt.xlabel("Actual # likes")
plt.ylabel("Predicted # likes")
absolute_errors = np.abs(y_test_actual - y_pred_actual)
correct_inds = absolute_errors <= 5.5
plt.scatter(y_test_actual[correct_inds], y_pred_actual[correct_inds], color=matplotlib.cm.viridis(0.1), marker='.', label=f'Within 5 likes: {np.sum(correct_inds) / len(correct_inds):.1%}')
correct_inds = absolute_errors <= 0.5
plt.scatter(y_test_actual[correct_inds], y_pred_actual[correct_inds], color=matplotlib.cm.viridis(0.7), marker='.', label=f'Exactly correct: {np.sum(correct_inds) / len(correct_inds):.1%}')
plt.legend()
# use log scales for the axes
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
plt.xlim(1, np.max(y_test_actual))
plt.ylim(1, np.max(y_pred_actual))
plt.show()
Tokenizing texts
If we want to train a linear regression model that includes information from the tweet text, we have to turn the texts into vectors.
To turn texts into vectors, we first split our text into smaller units. Tokenization is the process of splitting a text into tokens, which can include words, punctuation, emoji, etc. The most straight-forward tokenization approach for English text is to split on whitespace, which we implement in the next cell. I generally refer to "words" and "tokens" interchangeably, which is bad linguistic practice but fine most of the time.
(For a good summary of modern tokenizers, see here.)
def simple_tokenize(tweet_text):
    tokens = tweet_text.split()
    return tokens
tweet_df['tokens'] = tweet_df.text.map(simple_tokenize)
Let's see if there are any problems with this approach.
We can count the number of tokens produced for each tweet. Here, we'll look at the tweet with the largest number of tokens.
tweet_df['token_count'] = tweet_df.tokens.map(len)
tweet_df.token_count.max()
42
print(tweet_df[tweet_df.token_count == tweet_df.token_count.max()].text.iloc[0])
print(tweet_df[tweet_df.token_count == tweet_df.token_count.max()].tokens.iloc[0])
.@SciCmteDems @MichaelEMann @RepBonamici @capitalweather @HouseScience @BadAstronomer @NaomiOreskes @ClimateFdbk @VariabilityBlog @RasmusBenestad @TimOsbornClim @AGW_Prof @CubaRaglanGuy @DeformedEarth @epispheric @mzelinka @hausfath @ChrisJMerchant @LovejoyShaun @PeterGleick And I want to invite anyone who wants to help defend science to fact-check future @HouseScience hearings at the hashtag #FactCheckSST 13/13 ['.@SciCmteDems', '@MichaelEMann', '@RepBonamici', '@capitalweather', '@HouseScience', '@BadAstronomer', '@NaomiOreskes', '@ClimateFdbk', '@VariabilityBlog', '@RasmusBenestad', '@TimOsbornClim', '@AGW_Prof', '@CubaRaglanGuy', '@DeformedEarth', '@epispheric', '@mzelinka', '@hausfath', '@ChrisJMerchant', '@LovejoyShaun', '@PeterGleick', 'And', 'I', 'want', 'to', 'invite', 'anyone', 'who', 'wants', 'to', 'help', 'defend', 'science', 'to', 'fact-check', 'future', '@HouseScience', 'hearings', 'at', 'the', 'hashtag', '#FactCheckSST', '13/13']
Maybe we should reconsider our tokenization strategy.
Right now, we treat 'country.'/'country', '#Thanksgiving'/'Thanksgiving', or 'Great'/'great' as different tokens. This is an important modeling decision!
We'll lowercase everything, remove punctuation, and collapse all @-mentions into a single token and all links into another. In practice, we would likely want to combine some custom rules with an existing rule-based tokenizer like SpaCy.
import re
def normalize_token(token):
    # Lowercase and remove all non-alphanumeric characters
    return re.sub("[^0-9a-zA-Z]+", "", token).lower()

def better_tokenize(tweet_text):
    tokens = tweet_text.split()
    tokens_to_keep = []
    for token in tokens:
        if token.startswith('@'):  # replace all @s with a single token
            token = '@'
        elif token.startswith('https://'):  # replace all links with a single token
            token = '/link'
        else:
            token = normalize_token(token)
        if token == '':
            continue
        tokens_to_keep.append(token)
    return tokens_to_keep
tweet_df['tokens'] = tweet_df.text.map(better_tokenize)
tweet_df['token_count'] = tweet_df.tokens.map(len)
tweet_df.token_count.max()
42
Here's a sampling of the longest tweets with the new tokenization strategy. Can you identify other ways we could improve our tokenization function?
# here are the longest tweets with the new tokenization strategy
for row in tweet_df.sort_values(by='token_count', ascending=False).head(5).itertuples():
    print(f"{row.screen_name} ({row.token_count} tokens):\n{row.text}\n{row.tokens}\n")
RepDonBeyer (42 tokens): .@SciCmteDems @MichaelEMann @RepBonamici @capitalweather @HouseScience @BadAstronomer @NaomiOreskes @ClimateFdbk @VariabilityBlog @RasmusBenestad @TimOsbornClim @AGW_Prof @CubaRaglanGuy @DeformedEarth @epispheric @mzelinka @hausfath @ChrisJMerchant @LovejoyShaun @PeterGleick And I want to invite anyone who wants to help defend science to fact-check future @HouseScience hearings at the hashtag #FactCheckSST 13/13 ['scicmtedems', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', 'and', 'i', 'want', 'to', 'invite', 'anyone', 'who', 'wants', 'to', 'help', 'defend', 'science', 'to', 'factcheck', 'future', '@', 'hearings', 'at', 'the', 'hashtag', 'factchecksst', '1313'] GovernorTomWolf (38 tokens): .@RepLouBarletta @CongBoyle @RepBrady @RepCartwright @RepRyanCostello @RepCharlieDent @USRepMikeDoyle @RepDwightEvans @RepFitzpatrick @MikeKellyPA @RepTomMarino @RepMeehan @RepTimMurphy @RepScottPerry @KeithRothfus @RepBillShuster @RepSmucker @CongressmanGT #VoteNoAHCA: No CBO score, no public hearings, no text of final bill for public. Too important to rush. https://t.co/TPBpwNhUal #SaveACAinPA ['reploubarletta', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', 'votenoahca', 'no', 'cbo', 'score', 'no', 'public', 'hearings', 'no', 'text', 'of', 'final', 'bill', 'for', 'public', 'too', 'important', 'to', 'rush', '/link', 'saveacainpa'] GovernorTomWolf (36 tokens): .@RepLouBarletta @CongBoyle @RepBrady @RepCartwright @RepRyanCostello @RepCharlieDent @USRepMikeDoyle @RepDwightEvans @RepFitzpatrick @MikeKellyPA @RepTomMarino @RepMeehan @RepTimMurphy @RepScottPerry @KeithRothfus @RepBillShuster @RepSmucker @CongressmanGT #VoteNoAHCA: GOP lawmakers exempt their own insurance from the changes to the AHCA–but not for PA'ians. https://t.co/Oe2VL3SWOp #SaveACAinPA ['reploubarletta', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', 'votenoahca', 'gop', 'lawmakers', 'exempt', 'their', 'own', 'insurance', 'from', 'the', 'changes', 'to', 'the', 'ahcabut', 'not', 'for', 'paians', '/link', 'saveacainpa'] GovernorTomWolf (36 tokens): .@RepLouBarletta @CongBoyle @RepBrady @RepCartwright @RepRyanCostello @RepCharlieDent @USRepMikeDoyle @RepDwightEvans @RepFitzpatrick @MikeKellyPA @RepTomMarino @RepMeehan @RepTimMurphy @RepScottPerry @KeithRothfus @RepBillShuster @RepSmucker @CongressmanGT #VoteNoAHCA: A House Republican actually said "I don't think any individual has read the whole bill." https://t.co/D8ocKpAYFD #SaveACAinPA ['reploubarletta', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', 'votenoahca', 'a', 'house', 'republican', 'actually', 'said', 'i', 'dont', 'think', 'any', 'individual', 'has', 'read', 'the', 'whole', 'bill', '/link', 'saveacainpa'] RepDonBeyer (36 tokens): .@ponumurray @ChrisMurphyCT @SenBlumenthal @GerryConnolly @ChrisVanHollen @rosadelauro @RepEsty @Dan_at_Brady @Bradybuzz @NewtownAction In this picture, my friends Andy & Barbara Parker, whose daughter Alison, a tv reporter, was killed by someone who should not have had a gun ['ponumurray', '@', '@', '@', '@', '@', '@', '@', '@', '@', 'in', 'this', 'picture', 'my', 'friends', 'andy', 'amp', 'barbara', 'parker', 'whose', 'daughter', 'alison', 'a', 'tv', 'reporter', 'was', 'killed', 'by', 'someone', 'who', 'should', 'not', 'have', 'had', 'a', 'gun']
plt.hist(tweet_df.token_count, bins=range(0, tweet_df.token_count.max() + 1))
plt.title("Distribution of tokens per tweet")
plt.xlabel("Number of tokens")
plt.ylabel("Number of tweets")
mean_token_count = tweet_df.token_count.mean()
plt.axvline(mean_token_count, color='black', label=f'Mean tokens = {mean_token_count:.1f}')
plt.legend()
plt.show()
Now that we've tokenized our tweets, we can compute the number of times each word occurs across all of our tweets. We'll need these counts later, and for now it gives a useful sense of our dataset to look at the most commonly-occurring tokens.
To count the words, we can use the Python built-in collections.Counter.
from collections import Counter
token_counter = Counter()
for tokens in tweet_df.tokens:
    token_counter.update(tokens)
token_counter.most_common(10)
[('@', 989015), ('to', 779941), ('the', 735312), ('/link', 499420), ('of', 367153), ('in', 364656), ('for', 338542), ('a', 279474), ('rt', 267394), ('on', 266927)]
Creating vectors from tokens
Our first vectorization approach is straightforward:
- Choose a set of words (called a "vocabulary").
- Create a feature for each word in the vocabulary: 1 if the word is present in the text, 0 otherwise.
- The vector representation of a text is its values for each of the features.
For example, if my vocabulary is ["apple", "orange"], then I will produce two-dimensional vectors. Here are a few examples of texts, their token sequences, and the resulting vector representation.
- "No fruit here." ⟶ ["no", "fruit", "here"] ⟶ [0, 0]
- "I bought an apple." ⟶ ["i", "bought", "an", "apple"] ⟶ [1, 0]
- "Orange juice for me, please!" ⟶ ["orange", "juice", "for", "me", "please"] ⟶ [0, 1]
- "Apple or orange?" ⟶ ["apple", "or", "orange"] ⟶ [1, 1]
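To make this concrete, here's a quick sketch (an illustration only, not part of the main pipeline) that vectorizes those four example texts against the two-word vocabulary, reusing the better_tokenize function we defined in the tokenization section above:

# illustration: vectorize the toy texts with the ["apple", "orange"] vocabulary
toy_vocabulary = ['apple', 'orange']
toy_texts = ["No fruit here.", "I bought an apple.", "Orange juice for me, please!", "Apple or orange?"]
for text in toy_texts:
    tokens = better_tokenize(text)
    # 1 if the vocabulary word is present in the tokenized text, 0 otherwise
    vector = [1 if word in tokens else 0 for word in toy_vocabulary]
    print(f"{text!r:32} -> {vector}")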
We'll create our outcome in the same way we did above.
n_train = 100000
n_test = 100000
sampled_tweets = tweet_df.sample(n=n_train + n_test, random_state=0)
y = np.array(sampled_tweets.favorite_count, dtype='float32')
y = np.log(y + 1)
y.shape
(200000,)
For this example, we'll use a 5-word vocabulary. As we vectorize, we'll build our design matrix at the same time.
The resulting matrix has five columns, corresponding in order to each of the five words.
vocabulary = ['/link', 'thanks', 'happy', 'angry', 'the']
V = len(vocabulary)
X = np.zeros((len(sampled_tweets), V), dtype='float32')
for i, tokens in enumerate(sampled_tweets.tokens):
    for j, vocab_token in enumerate(vocabulary):
        if vocab_token in tokens:
            X[i, j] = 1
X.shape
(200000, 5)
Now we're ready to retrain our model with the new vector representation.
# same as above
X_train = X[:n_train]
X_test = X[n_train:]
y_train = y[:n_train]
y_test = y[n_train:]
model = train_model(X_train, y_train)
# what's the performance like for our new model?
mean_squared_error, absolute_errors, y_pred = evaluate_model(model, X_test, y_test)
mean_squared_error
2.6667957
Comparing models
2.67 Mean Squared Error is lower than the 3.75 Mean Squared Error we achieved above, so this content-based model is "better".
Looking at the figure below, we can see that this model predicts far fewer likes on average; when we're wrong, we tend to way underestimate the true number of likes.
Note also that our ability to predict exactly correctly has fallen, from 10% to 6%! That's one sign that Mean Squared Error may be an inappropriate metric for our problem.
# same visualization as above
y_pred_actual = np.exp(y_pred) - 1
y_test_actual = np.exp(y_test) - 1
plt.scatter(y_test_actual, y_pred_actual, color='black', marker='.', alpha=0.2)
plt.xlabel("Actual # likes")
plt.ylabel("Predicted # likes")
absolute_errors = np.abs(y_test_actual - y_pred_actual)
correct_inds = absolute_errors <= 5.5
plt.scatter(y_test_actual[correct_inds], y_pred_actual[correct_inds], color=matplotlib.cm.viridis(0.1), marker='.', label=f'Within 5 likes: {np.sum(correct_inds) / len(correct_inds):.1%}')
correct_inds = absolute_errors <= 0.5
plt.scatter(y_test_actual[correct_inds], y_pred_actual[correct_inds], color=matplotlib.cm.viridis(0.7), marker='.', label=f'Exactly correct: {np.sum(correct_inds) / len(correct_inds):.1%}')
plt.legend()
# use log scales for the axes
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
plt.show()
Inspecting the model's weights
Let's plot the weight the model learned for each word in our five-word vocabulary.
plt.bar(range(V), model.coef_)
plt.axhline(0, color='gray')
plt.xticks(range(V), vocabulary)
plt.show()
We can use these weights as a rough estimate of importance: the presence of links or the word 'angry' is associated with the greatest increase in number of likes. Surprisingly, the presence of the word 'thanks' is associated with a decrease in number of likes. However, interpreting these weights directly is risky: we are likely to have multicollinearity problems. (These problems will get worse the larger our vocabulary is.)
Turning tokens into bag-of-words
We don't want to manually choose words for our vocabulary.
- It's slow and requires lots of domain knowledge.
- Humans have very bad intuitions about what words are important.
- Even if we generate a big word list (1000+), we are ignoring most of the content in the tweets!
By knowing a little bit about language data, we can choose words in a more general way.
Zipf's Law
Word (or token) usage follows a predictable distribution.
If 'to' is the most common word in our data and appears at frequency $f_{\text{max}}$, then the frequency of the second most common word is approximately $f = \frac{1}{2} f_{\text{max}}$, the frequency of the third most common word is approximately $f = \frac{1}{3} f_{\text{max}}$, etc.
In general terms, the $k$th most common word appears with frequency $f = \frac{1}{k} f_{\text{max}}$.
We can see that our data follow this trend fairly well by plotting the sorted word frequencies (from the token_counter we created above).
# plot word rank vs word count
token_counts = np.array([count for token, count in token_counter.most_common()])
plt.loglog(range(1, len(token_counts)+1), token_counts)
# compute and plot the counts if the true distribution was Zipfian
largest_count = token_counts[0]
true_zipf = [(1, largest_count)]
for i in np.geomspace(1, len(token_counts), endpoint=True, num=25):
    rank = int(i) + 1
    true_zipf.append((rank, largest_count * 1/rank))
xs, ys = zip(*true_zipf)
plt.plot(xs, ys, color='gray', linestyle='-', alpha=0.5)
plt.xlabel("Frequency rank of token")
plt.ylabel("Absolute frequency of token")
plt.title(f"Zipf plot for {len(token_counts):,} unique tokens")
# annotate the hapax legomena
plt.annotate(f"{sum(token_counts == 1) / len(token_counts):.1%} of tokens appear exactly once",
xy=(len(token_counts) * 0.2, 1), xytext=(10**4.5, 1), arrowprops={'arrowstyle': 'simple'}, va='center', ha='right')
plt.show()
We call tokens that appear exactly once hapax legomena or sometimes just hapaxes.
As the figure shows, about 80% of the tokens in our data appear exactly once, which is higher than we would expect for more general English text.
One implication of word frequency being "Zipfian" is that we can't learn much from most tokens (as they appear only once!). In practice, we'll choose a vocabulary that includes the $n$ most-frequent words.
How many words should we include in our vocabulary? We can either set a particular $n$ or choose based on the minimum word frequency. For example, let's look at how vocabulary size changes as we raise the minimum word frequency requirement:
# how large will our vocabulary be at different absolute token frequency cut-offs?
xs = []
ys = []
for i in range(2, 10):
    xs.append(i)
    ys.append(np.sum(token_counts >= i))
plt.plot(xs, ys)
plt.scatter(xs, ys, color='black', zorder=5)
plt.xlabel("Minimum count for tokens in our vocabulary")
plt.ylabel("Total vocabulary size (|V|)")
plt.show()
Note: vocabulary size is an empirical question, and should be chosen based on the intended purpose. If the goal is to maximize predictive model performance, then vocab size should be treated as a hyperparameter during model selection.
We'll set our vocab size to 30,000, which is a reasonable and fairly common value.
V = 30000
token_index_map = {}
index = 0
for token, count in token_counter.most_common(V):
    token_index_map[token] = index
    index += 1
The token_index_map we just created captures the vocabulary, as it maps from tokens to particular indices in our feature vectors. If a word isn't in the token_index_map, it will be ignored.
# to check our work, we need to map back from indices to the actual token text
index_token_map = {index: token for token, index in token_index_map.items()}
token_index_map['iraq']
1437
index_token_map[1437]
'iraq'
# the very last word in our vocabulary
index_token_map[29999]
'demos'
Vectorizing texts using our vocabulary
We'll build the tweet vectors (and design matrix) largely as we did when |V| was 5, just at a larger scale and using the token_index_map. One additional improvement we'll make is to track not just whether a word is present (feature = 1) or not (feature = 0), but the number of times the word occurs in the tweet.
Because each row is a tweet and each column is a token, we call the resulting matrix a document-term matrix. In our case, "tweet" and "document" mean the same thing. "Term" just means "token".
We call this approach "bag of words" because the vectorized tweet no longer contains information about word order. "Apple or orange?" and "Orange or apple?" will both have the same vector representation, since they contain the same raw token counts and differ only in order.
n_train = 100000
n_test = 100000
sampled_tweets = tweet_df.sample(n=n_train + n_test, random_state=0)
y = np.array(sampled_tweets.favorite_count, dtype='float32')
y = np.log(y + 1)
y.shape
(200000,)
X = np.zeros((len(sampled_tweets), V), dtype='float32')
for row_ind, tokens in enumerate(sampled_tweets.tokens):
    for token in tokens:
        if token not in token_index_map:
            # this is a word not in the vocabulary!
            continue
        column_ind = token_index_map[token]
        X[row_ind, column_ind] += 1
X.shape
(200000, 30000)
We'll look at a random tweet in the training data to see what features will be assigned non-zero values.
row_ind = 0
print(sampled_tweets.iloc[row_ind].text + "\n")
# this next line is a bit of numpy magic, but it returns the indices that are non-zero in X[row_ind,:]
for column_ind in np.argwhere(X[row_ind,:]).ravel():
    token = index_token_map[column_ind]
    print(f"ind= {column_ind:>5} n= {X[row_ind,column_ind]:.0f} token= {token}")
ICYMI I called on my colleagues in Congress to reform how we calculate Social Security COLAs https://t.co/JOPaHggJe2 ind= 1 n= 1 token= to ind= 3 n= 1 token= /link ind= 5 n= 1 token= in ind= 9 n= 1 token= on ind= 16 n= 1 token= i ind= 17 n= 1 token= my ind= 19 n= 1 token= we ind= 61 n= 1 token= congress ind= 79 n= 1 token= how ind= 172 n= 1 token= security ind= 174 n= 1 token= icymi ind= 189 n= 1 token= reform ind= 333 n= 1 token= colleagues ind= 721 n= 1 token= social ind= 921 n= 1 token= called ind= 25594 n= 1 token= calculate ind= 26036 n= 1 token= colas
Sidebar: using sparse matrices to save memory
As we can see from the code below, the dense matrix we've instantiated uses over 22 GB of RAM. Further, 99.9% of the cells in the matrix are zero, which is a huge waste of memory.
X.nbytes / 1024 / 1024 / 1024 # 22 GB!!!
22.351741790771484
# 99.9% of cells in our matrix are zero!! huge waste of memory
np.sum(X == 0) / (X.shape[0] * X.shape[1])
0.999485602
In practice, we'll use a sparse matrix format to only store the cells that contain non-zero values.
We can use the scipy.sparse package for this. In our case, we can convert directly from the dense format to the sparse matrix format, but we can save memory by constructing the matrix directly in the sparse format.
import scipy.sparse
X = scipy.sparse.coo_matrix(X)
# NNZ = "Number of Non-Zero" values stored in the matrix
# (this counts only the 4-byte float32 values; the row/column index arrays add a bit more)
n_stored_bytes = X.nnz * 4
n_stored_bytes / 1024 / 1024 / 1024  # about 0.01 GB!
0.01149769127368927
X = X.tocsr()
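For completeness, here's a minimal sketch of the "construct directly in the sparse format" alternative mentioned above. It never allocates the 22 GB dense matrix; instead, it collects (row, column, value) triplets and hands them to the COO constructor. (The name X_sparse is just for illustration.)

# build the document-term matrix directly in sparse (COO) format
rows, cols, vals = [], [], []
for row_ind, tokens in enumerate(sampled_tweets.tokens):
    for token, count in Counter(tokens).items():
        if token in token_index_map:
            rows.append(row_ind)
            cols.append(token_index_map[token])
            vals.append(count)
X_sparse = scipy.sparse.coo_matrix((vals, (rows, cols)), shape=(len(sampled_tweets), V), dtype='float32').tocsr()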
Training a model using the document-term matrix
So how does our 30,000-word bag-of-words vectorization compare to our two previous models? Let's train a new model and find out.
# same as above
X_train = X[:n_train]
X_test = X[n_train:]
y_train = y[:n_train]
y_test = y[n_train:]
X_train.shape, X_test.shape
((100000, 30000), (100000, 30000))
Previously, we've trained models using our convenience function: model = train_model(X_train, y_train)
Training this model will take a long time using a sparse feature matrix this large. For numerical reasons, we can speed up optimization by using some form of regularization. We'll use the "Ridge" estimator, which will bias the weights learned by the model (to be closer to 0) and speed up estimation.
Sidebar: Regularization actually improves the performance of the model: the added bias addresses the multicollinearity problems that arise with large numbers of features. The train_model approach produces a model with 2.47 Mean Squared Error (not shown in this tutorial), which is higher than the value we get below. Using the 30,000 most common words rather than 5 hand-picked words does result in a better Mean Squared Error, even accounting for the use of the Ridge estimator.
import sklearn.linear_model
model = sklearn.linear_model.Ridge(alpha=10.0, solver='sparse_cg')
model.fit(X_train, y_train)
Ridge(alpha=10.0, solver='sparse_cg')
mean_squared_error, absolute_errors, y_pred = evaluate_model(model, X_test, y_test)
mean_squared_error
1.7708157
1.77 is a lot lower than our previous error! We've also increased the percentage of predictions that are exactly correct.
# same visualization as above
y_pred_actual = np.exp(y_pred) - 1
y_test_actual = np.exp(y_test) - 1
plt.scatter(y_test_actual, y_pred_actual, color='black', marker='.', alpha=0.2)
plt.xlabel("Actual # likes")
plt.ylabel("Predicted # likes")
absolute_errors = np.abs(y_test_actual - y_pred_actual)
correct_inds = absolute_errors <= 5.5
plt.scatter(y_test_actual[correct_inds], y_pred_actual[correct_inds], color=matplotlib.cm.viridis(0.1), marker='.', label=f'Within 5 likes: {np.sum(correct_inds) / len(correct_inds):.1%}')
correct_inds = absolute_errors <= 0.5
plt.scatter(y_test_actual[correct_inds], y_pred_actual[correct_inds], color=matplotlib.cm.viridis(0.7), marker='.', label=f'Exactly correct: {np.sum(correct_inds) / len(correct_inds):.1%}')
plt.legend()
# use log scales for the axes
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
plt.xlim(1, np.max(y_test_actual))
plt.ylim(1, np.quantile(y_pred_actual, 0.9999))
plt.show()
Let's look at the weights for the top-20 tokens by frequency.
n = 20
w = model.coef_[:n]
plt.bar(range(n), w)
plt.axhline(0, color='gray')
plt.ylabel("Weight (Model coefficient)")
token_labels = [index_token_map[i] for i in range(n)]
plt.xticks(range(n), token_labels, rotation=45)
plt.show()
Weights are generally small, with the exception of 'rt' (we've already discussed why retweets are assigned zero likes in our data) and '/link' (links tend to be associated with more likes).
But these are just 20 of our 30,000 features; what if we want to reason about the other 29,980?
w = np.array(model.coef_)
w.sort()
plt.vlines(range(len(w)), np.minimum(w, 0), np.maximum(w, 0), linewidth=0.5)
plt.xlabel("Features (sorted by weight)")
plt.ylabel("Weight (Model coefficient)")
plt.annotate(f'Lowest weight ({w[0]:.2f})', xy=(0, w[0]), xytext=(5000, w[0]), arrowprops={'arrowstyle': 'simple'}, va='center', ha='left')
plt.annotate(f'Highest weight ({w[-1]:.2f})', xy=(30000, w[-1]), xytext=(25000, w[-1]), arrowprops={'arrowstyle': 'simple'}, va='center', ha='right')
plt.annotate(f'{np.sum(w == 0) / len(w):.2%} of the weights are zero', xy=(15000, 0), xytext=(15000, 1), arrowprops={'arrowstyle': 'simple'}, va='center', ha='center')
plt.show()
Sorting the weights lets us see their relative magnitudes and their distribution, and reveals that nearly 6% of the weights are exactly zero.
If we want to look at the weights on the extremes, it will be easier if we extract the token weights into a DataFrame.
w = model.coef_
weights = []
for i, weight in enumerate(w):
    token = index_token_map[i]
    weights.append({
        'token': token,
        'token_count': token_counter[token],
        'weight': weight,
        'abs_weight': np.abs(weight),
    })
weight_df = pd.DataFrame(weights)
len(weight_df)
30000
weight_df.sample(n=5)
|  | token | token_count | weight | abs_weight |
|---|---|---|---|---|
| 28760 | underutilized | 17 | -0.042325 | 0.042325 |
| 28099 | 230p | 18 | 0.000000 | 0.000000 |
| 9431 | repdianeblack | 115 | -0.142293 | 0.142293 |
| 6886 | engines | 190 | 0.012400 | 0.012400 |
| 11457 | espaillatny | 84 | -0.044266 | 0.044266 |
Looking good! Now we can sort by weight to look at the features with the largest and smallest weights.
weight_df = weight_df.sort_values(by='weight', ascending=False)
weight_df.head(10)
|  | token | token_count | weight | abs_weight |
|---|---|---|---|---|
| 7964 | makeamericagreatagain | 151 | 3.021243 | 3.021243 |
| 11472 | trump2016 | 84 | 2.750839 | 2.750839 |
| 5716 | crooked | 253 | 2.745887 | 2.745887 |
| 7774 | americafirst | 157 | 2.612355 | 2.612355 |
| 9204 | dishonest | 119 | 2.159368 | 2.159368 |
| 12561 | industrialized | 72 | 2.118224 | 2.118224 |
| 2189 | devos | 1009 | 2.000655 | 2.000655 |
| 19142 | crookedhillary | 36 | 1.916129 | 1.916129 |
| 6957 | flynns | 188 | 1.734988 | 1.734988 |
| 13353 | bigleaguetruth | 65 | 1.656239 | 1.656239 |
Looks like a variety of Trump-related terminology is most associated with receiving likes.
weight_df.tail(10)
|  | token | token_count | weight | abs_weight |
|---|---|---|---|---|
| 919 | ia03 | 2742 | -0.714490 | 0.714490 |
| 5009 | hagel | 311 | -0.714975 | 0.714975 |
| 11305 | pittenger | 86 | -0.737141 | 0.737141 |
| 5798 | hassans | 248 | -0.769951 | 0.769951 |
| 1479 | sctweets | 1655 | -0.807138 | 0.807138 |
| 3922 | franken | 455 | -0.837031 | 0.837031 |
| 6299 | tedstrickland | 218 | -0.849467 | 0.849467 |
| 8531 | mow50 | 135 | -0.861965 | 0.861965 |
| 2356 | brownback | 908 | -1.101062 | 1.101062 |
| 8 | rt | 267394 | -1.668904 | 1.668904 |
Among the negative weights, things are a little less clear.
We've already discussed why retweets (which contain the token 'rt') will have zero likes, so the strong negative weight makes sense. But why such a strong negative weight for 'brownback'? Let's look at the underlying data.
tweet_df[tweet_df.tokens.map(lambda tokens: 'brownback' in tokens)].sample(n=5)[['screen_name', 'text', 'favorite_count']]
|  | screen_name | text | favorite_count |
|---|---|---|---|
| 124361 | govsambrownback | Watch the video of Gov. #Brownback on @Bloombe... | 2 |
| 67571 | govsambrownback | LtGov Colyer: Leg Oversight Essential to Provi... | 0 |
| 19606 | govsambrownback | Don't live in Wichita and want to watch tonigh... | 0 |
| 91785 | govsambrownback | Gov #Brownback Makes Appointment to the 10th J... | 0 |
| 43611 | govsambrownback | Gov Brownback met w/ employees of Triumph Stru... | 0 |
This looks fairly normal: tweets from Kansas Governor Sam Brownback about day-to-day business, not attracting any engagement. However, I noticed that the account was suspended by Twitter. What's going on here? Local Kansas newspaper the Topeka Capital-Journal has an article about it: the account was run by an imposter unaffiliated with Sam Brownback! These tweets weren't really generated by a US politician, and our model helped us identify this account as an outlier that we would likely want to remove from our dataset.
Error analysis
One final task that can help us understand the model we've trained is to look at the errors.
First, let's look at the distribution of predicted vs actual scores in a different way than we have before.
# we exponentiate the values to "undo" the log transform we applied
actual = np.exp(y_test) - 1
# and for the predicted values, we need to round
# we'll use Numpy's around() for this, although this will introduce some slight rounding error
predicted = np.around(np.exp(y_pred) - 1)
# there's a further issue: we are using a model that can produce negative predictions
print(f"{np.sum(predicted < 0) / len(predicted):.1%} of predictions are below zero pre-correction")
# we'll fix this by forcing all negative values to be 0 (which will increase our accuracy compared to our previous comparison)
predicted = np.maximum(predicted, 0)
errors = actual - predicted
print(f"{np.sum(errors == 0) / len(errors):.1%} predict the correct number of likes")
print(f"{np.sum(errors > 0) / len(errors):.1%} under-predict number of likes")
print(f"{np.sum(errors < 0) / len(errors):.1%} over-predict number of likes")
print(f"Mean prediction error: {np.mean(errors):.2f}")
# we use Numpy quantile() to only show up to the 99th percentile of true values
bins = np.linspace(0, np.quantile(actual, 0.99))
plt.hist(actual, bins=bins, alpha=0.5, log=True, label="True")
plt.hist(predicted, bins=bins, alpha=0.5, log=True, label="Predicted")
plt.legend()
plt.xlabel("# of likes")
plt.ylabel("Tweet count")
plt.show()
3.0% of predictions are below zero pre-correction 21.4% predict the correct number of likes 32.0% under-predict number of likes 46.6% over-predict number of likes Mean prediction error: 192.00
The summary statistics reveal that we over-predict (47% of tweets) more than we under-predict (32% of tweets). However, looking at the visualization can help us understand these errors: even excluding the mega-viral tweets, we chronically under-predict on any tweet with more than 1000 likes since the model rarely produces scores that large. It looks like the majority of over-prediction is occurring for tweets that have zero likes.
Next steps: improving our vector representations
To further improve our model, we can improve the quality of our vector representations. TF-IDF is a re-weighting of the bag-of-words vectors we've constructed for each document that has strong empirical performance. The reweighting consists of two parts:
- Term frequency (TF): The term frequency is just the number of times the token appears in a document, which is what we are currently using. We can improve our term frequency representation by taking the log of the raw count plus 1: this transformation reduces the weight of a word occurring multiple times. Intuitively, a tweet that uses the same word twice should not receive twice the weight for that feature as a tweet that uses that word once.
- Inverse Document Frequency (IDF): The document frequency is the proportion of documents in which the word appears. We multiply by the inverse of the document frequency, which decreases the weight of tokens that appear in many documents, like 'the'.
scikit-learn's documentation summarizes some of the math behind TF-IDF.
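As a rough sketch of how this could slot into our pipeline (my addition, not something we evaluate in this tutorial), scikit-learn's TfidfTransformer can re-weight the count matrices we already built; sublinear_tf=True applies the log-scaled term frequency described above.

import sklearn.feature_extraction.text
# re-weight the existing document-term count matrices with TF-IDF
tfidf = sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True)
X_tfidf_train = tfidf.fit_transform(X_train)  # learn IDF weights from the training set only
X_tfidf_test = tfidf.transform(X_test)        # apply those weights to the test set

We could then train and evaluate a Ridge model on these matrices exactly as we did above and check whether Mean Squared Error improves.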
An additional limitation of the vectorization approach we've chosen is that we completely ignore words that aren't in our vocabulary. One approach to address this problem is to add a feature that counts all unknown words so that they can still be modeled a bit, but a more complex approach is to choose the column to increment in the document-term matrix based on a hash function. The hashing trick is one of my favorite topics in NLP. scikit-learn offers an implementation and some discussion on the hashing trick here.
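Here's a small sketch of the hashing trick using scikit-learn's FeatureHasher (again illustrative, not part of the tutorial's pipeline): each token is hashed to one of a fixed number of columns, so out-of-vocabulary words still contribute to the representation.

import sklearn.feature_extraction
# hash each token to one of 2**15 columns instead of looking it up in a vocabulary
hasher = sklearn.feature_extraction.FeatureHasher(n_features=2**15, input_type='string')
X_hashed = hasher.transform(sampled_tweets.tokens)  # a scipy sparse matrix, ready for Ridge
X_hashed.shape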
Our model may not be expressive enough to capture more complex associations between tweet content and number of likes. We can likely do better by including feature interactions (or by using a neural model that will map our features into a latent space where interactions are easier to capture). As a specific limitation, by ignoring word context and order we can't differentiate between homonyms and some turns of phrase. More complex vectorization approaches can account for this context.
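As a sketch of the feature-interaction idea (purely illustrative; PolynomialFeatures and the 50-column cutoff are my choices, since expanding all 30,000 columns would produce far too many features), we could add pairwise interactions for a small subset of columns:

import sklearn.preprocessing
# pairwise interactions among the 50 most frequent tokens (the first 50 columns only)
poly = sklearn.preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_inter_train = poly.fit_transform(X_train[:, :50].toarray())
X_inter_test = poly.transform(X_test[:, :50].toarray())
X_inter_train.shape  # 50 original columns plus 50 * 49 / 2 interaction columns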
One simple improvement is to be more selective about the words we include in our vocabulary. A common approach is to omit "stop words" from the vocabulary: words perceived to have no predictive value or semantic meaning in our context. (While many libraries and packages treat stop-word removal as a default, I recommend that you default to not excluding stop-words and carefully evaluate if their removal improves your model.) Similarly, while we reduced the size of our vocabulary by removing case information, we can stem our tokens to lose information about inflected word forms in our data in pursuit of a smaller vocabulary (e.g. mapping "stately", "states", and "state" to the single token stem "stat").
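As one last sketch (this assumes the nltk package is installed; neither stemming nor stop-word removal is part of the pipeline above), stop-word filtering and stemming could be bolted onto our tokenization like this, after which we would rebuild the vocabulary and document-term matrix and check whether test error actually improves:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def filter_and_stem(tokens):
    # drop stop words, then reduce the remaining tokens to their stems
    return [stemmer.stem(token) for token in tokens if token not in ENGLISH_STOP_WORDS]
tweet_df['stemmed_tokens'] = tweet_df.tokens.map(filter_and_stem)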
This tutorial covered the basics of training and evaluating a linear regression model, using both text and non-text features. We covered the bag-of-words representation, building a vocabulary based on word frequency, and examining the weights learned by our model.
This tutorial was originally presented in the University of Minnesota's ProDUCT Lab meeting on October 29th, 2021. Questions or comments are welcome.