After reading in Bloomberg Businessweek that the subreddit r/WallStreetBets was likely behind some interesting stock trends with Virgin Galactic (SPCE) and Plug Power (PLUG), I decided to explore these relationships myself.
import pandas as pd
from pandas.io.json import json_normalize as jnorm
import numpy as np
import requests
import json
from datetime import datetime
import traceback
import re
from datetime import datetime
from datetime import timedelta
I put together a function to retrieve posts or comments from a subreddit within any time period (the last 30 days, for example) and save them in a structured CSV format.
#subreddit = "wallstreetbets"
def _writeRecord(handle, item, text):
    """Write one tab-separated record (score, timestamp, sanitized text) to the open file.

    The text is reduced to ASCII and has newlines/tabs flattened to spaces so
    that each record stays on a single tab-delimited line.
    """
    handle.write(str(item['score']))
    handle.write("\t")
    handle.write(
        datetime.fromtimestamp(item['created_utc']).strftime(
            "%Y-%m-%d-%H-%M-%S"))
    handle.write("\t")
    textASCII = text.encode(encoding='ascii', errors='ignore').decode()
    textASCII = re.sub('\n', ' ', textASCII)
    textASCII = re.sub('\t', ' ', textASCII)
    handle.write(textASCII)
    handle.write("\n")


def downloadFromSubreddit(filename, object_type, days_back,
                          subreddit):
    """Download recent Reddit objects from a subreddit via Pushshift into a TSV file.

    Pages backwards through the Pushshift API (1000 objects per request) from
    now until `days_back` days ago, writing score, timestamp and text of each
    kept object as one tab-separated line of `filename`.

    Parameters:
        filename    -- path of the output file (tab-separated, one record per line)
        object_type -- 'comment' or 'submission' (Pushshift endpoint name)
        days_back   -- number of days to look back, e.g. '90' (string or int)
        subreddit   -- subreddit name without the 'r/' prefix
    """
    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&sort=desc&subreddit={}&before="
    start_epoch = int(datetime.utcnow().timestamp())
    stop_epoch = start_epoch - int(days_back) * 86400
    print(f"Saving {object_type}s to {filename}")
    count = 0
    previous_epoch = start_epoch
    # `with` guarantees the file is closed even if a request or parse fails.
    with open(filename, 'w') as handle:
        while previous_epoch > stop_epoch:
            new_url = url_template.format(object_type,
                                          subreddit) + str(previous_epoch)
            response = requests.get(new_url,
                                    headers={'User-Agent': "/u/isaacwino"})
            json_data = response.json()
            if 'data' not in json_data:
                break
            objects = json_data['data']
            if not objects:
                break
            for item in objects:
                # Step the paging cursor past this object (Pushshift pages by
                # 'before' epoch, descending).
                previous_epoch = item['created_utc'] - 1
                count += 1
                if object_type == 'comment':
                    try:
                        text = item['body']
                        if text != '[removed]':
                            _writeRecord(handle, item, text)
                    except Exception:
                        print(
                            f"Couldn't print comment: https://www.reddit.com{item['permalink']}"
                        )
                        print(traceback.format_exc())
                elif object_type == 'submission':
                    # Only self (text) posts carry a body worth saving.
                    if item['is_self']:
                        if 'selftext' not in item:
                            continue
                        try:
                            text = item['selftext']
                            if text != '[removed]':
                                _writeRecord(handle, item, text)
                        except Exception:
                            print(f"Couldn't print post: {item['url']}")
                            print(traceback.format_exc())
            print("Saved {} {}s through {}".format(
                count, object_type,
                datetime.fromtimestamp(previous_epoch).strftime("%Y-%m-%d")))
    print(f"Saved {count} {object_type}s")
#USAGE EXAMPLE:
#downloadFromSubreddit("comments_90days.csv", "comment", '90', 'wallstreetbets')
# Load the scraped comments into a DataFrame (tab-separated: score, timestamp, text).
comments = pd.read_csv('comments_90days.csv',
                       sep='\t',
                       names=['Score', 'Time', 'Comment'],
                       lineterminator='\n')
# Parse the timestamp column once and promote it to the DataFrame index
# (set_index both installs it as the index and removes the column).
comments['Time'] = pd.to_datetime(comments['Time'], format='%Y-%m-%d-%H-%M-%S')
comments = comments.set_index('Time')
# Take a gander at that pretty dataframe
comments.head()
#Counts the frequency of the stock ticker name (uppercase and lowercase mentions)
import re
def word_freq2(array, word):
    """Count case-insensitive whole-word occurrences of `word` in a pandas Series.

    The Series is rendered to one big lowercase string (one row per line) and
    scanned with a word-boundary regex, so 'AAPL' and 'aapl' both count but
    substrings like 'AAPLX' do not.
    """
    haystack = array.to_string(index=False).lower()
    pattern = r'\b%s\b' % re.escape(word.lower())
    return len(re.findall(pattern, haystack))
#Gets the mentions over time in the preferred time interval
def getMentionsOverTime(stock, interval='d', return_df=False):
    """Count mentions of `stock` per time bucket in the global `comments` DataFrame.

    Parameters:
        stock     -- ticker string to count (case-insensitive, whole word)
        interval  -- pandas offset alias for the bucket size, e.g. 'd' or 'h'
        return_df -- if True return the counts DataFrame; otherwise plot it

    Relies on the module-level `comments` DataFrame (DatetimeIndex, 'Comment'
    column) and on `word_freq2` for the per-bucket counting.
    """
    col = 'Mentions of {}'.format(stock)
    times = []
    counts = []
    # Bucket the time-indexed comments by the requested interval and count
    # mentions within each bucket's comments.
    for bucket_start, group in comments.groupby(pd.Grouper(freq=interval)).Comment:
        times.append(bucket_start)
        counts.append(word_freq2(group, stock))
    # The group keys are already Timestamps, so build the DatetimeIndex
    # directly instead of round-tripping through a string-parsed column.
    term_df = pd.DataFrame({col: counts},
                           index=pd.DatetimeIndex(times, name='Time'))
    if return_df:
        return term_df
    return term_df.plot()
# Plot daily mention counts for a few tickers of interest.
getMentionsOverTime('AAPL')
getMentionsOverTime('SPCE')
getMentionsOverTime('TSLA')
#Write to CSV
# AAPL_mentions = getMentionsOverTime('AAPL', return_df = True)
# TSLA_mentions = getMentionsOverTime('TSLA', return_df = True)
# AAPL_mentions.to_csv('AAPL_mentions_perday_90days.csv')
# TSLA_mentions.to_csv('TSLA_mentions_perday_90days.csv')