import pandas as pd
import re
import numpy as np
import requests
from bs4 import BeautifulSoup
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from functions import *
plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})
colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))
current_dir = '/Users/BrunoHenriques/Desktop/OneDrive/Workspace_bitbucket/datascience/Web_scraping'
#GBP to EURO current
response = requests.get('http://www.x-rates.com/calculator/?from=GBP&to=EUR&amount=1')
content = response.content
parser = BeautifulSoup(content,'html.parser')
xrate = parser.select('.ccOutputRslt')
print(xrate[0].text)
#Any date and currency will do (use GBP)
date='2017-10-16'
response = requests.get('http://www.x-rates.com/historical/?from=GBP&amount=1&date='+date)
content = response.content
parser = BeautifulSoup(content,'html.parser')
#this will have a name and conversion between GBP and all other currencies
xrate = parser.select('a[href*="/graph/?from"]')
#extract the names of other currencies
currency_list = []
for element in xrate:
text = re.sub('.*from=','', element['href'])
text = re.sub('&to=',' ', text)
if(re.match('^GBP', text)):
currency_list.append(re.sub('GBP ','', text))
currency_list.append('GBP')
#append the use set to get the unique values then convert back to list
print(list(set(currency_list)))
currency = 'EUR'
date = pd.date_range(start='01/01/2018', end='09/30/2018', freq='d')
date = [str(element.date()) for element in date]
df_currency_list = pd.Series(currency_list)
df_currency_list = df_currency_list[~df_currency_list.duplicated(keep='first')]
df_currency_list = df_currency_list.reset_index(drop=True)
df = pd.DataFrame(np.zeros((len(date), len(df_currency_list))), index=date, columns=df_currency_list)
for idx, xx in enumerate(date):
if(idx%100==0):
print('doing day %d of %d\n' % (idx, len(date)))
response = requests.get('http://www.x-rates.com/historical/?from='+currency+'&amount=1&date='+xx)
content = response.content
parser = BeautifulSoup(content,'html.parser')
xrate = parser.select('.tablesorter')
xrate = parser.select('a[href*="/graph/?from"]')
for yy in xrate:
text = re.sub('.*from=','', yy['href'])
text = re.sub('&to=',' ', text)
if(re.match('^'+currency, text)):
target_currency = (re.sub(currency+' ','', text))
df.loc[xx][target_currency] = yy.text
#write data
df.to_csv(current_dir+'/xrates_data/'+currency+'.csv', header=True)
print('\n\nDONE')
currency = 'EUR'
df = pd.read_csv(current_dir+'/xrates_data/'+currency+'.csv', index_col=0)
display(df[:2])
date = pd.to_datetime(df.index)
start_date = '2018-01-01'
end_date = '2018-10-01'
date = date[(date>=start_date) & (date<=end_date)]
fig = plt.figure(figsize=(10,5))
ax = plt.subplot()
ax.plot(date, df['GBP'].loc[start_date: end_date].values)
#format date axis
days_locator = mdates.DayLocator(bymonthday=[1,])
ax.xaxis.set_major_locator(days_locator)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
ax.tick_params(axis='x', rotation=90)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim([start_date, end_date])
plt.show()
from scipy.fftpack import fft, ifft
from scipy import zeros
y = df['GBP'].loc[start_date: end_date].values
x = date
n = len(y)
COMPONENTS = [20, n]
#ax = plt.subplot()
for c in COMPONENTS:
fig = plt.figure(figsize=(10,5))
colors = np.linspace(start=100, stop=255, num=c)
for i in range(c):
#Compute the one-dimensional discrete Fourier Transform
Y = np.fft.fft(y)
#replace all Fourier components above c with 0
np.put(Y, range(i+1, n), 0.0)
#Compute the one-dimensional inverse discrete Fourier Transform
ifft = np.fft.ifft(Y)
plt.plot(x, ifft, color=plt.cm.Reds(int(colors[i])), alpha=.70)
plt.tick_params(axis='x', rotation=45)
plt.title("First %d fourier components" %(c))
plt.plot(x,y, label="Original dataset")
plt.grid(linestyle='dashed')
plt.legend()
plt.show()
y = df['GBP'].loc[start_date: end_date].values
x = date
n = len(y)
COMPONENTS = [10, 20, 30]
#COMPONENTS = [30]
fig = plt.figure(figsize=(10,5))
ax = plt.subplot()
jj = 0
for c in COMPONENTS:
colors = np.linspace(start=100, stop=255, num=len(COMPONENTS))
#Compute the one-dimensional discrete Fourier Transform
Y = np.fft.fft(y)
#replace all Fourier components above c with 0
np.put(Y, range(c+1, n), 0.0)
#Compute the one-dimensional inverse discrete Fourier Transform
ifft = np.fft.ifft(Y)
ax.plot(x, ifft, color=plt.cm.Reds(int(colors[jj])), alpha=.70)
jj+=1
ax.tick_params(axis='x', rotation=45)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_xlim([start_date, end_date])
plt.title("First %d fourier components" %(c))
plt.plot(x,y, label="Original dataset")
plt.grid(linestyle='dashed')
plt.legend()
plt.show()
def get_slope_changes(x,y):
xx = []
yy = []
aux_y = y[0].real
if (y[0].real>y[1].real):
flag = 1
else:
flag = -1
for ii in range(1, len(x)):
if (flag == 1) & (y[ii].real<y[ii-1].real):
xx.append(x[ii])
yy.append(y[ii].real)
flag=-1
if (flag == -1) & (y[ii].real>y[ii-1].real):
xx.append(x[ii])
yy.append(y[ii].real)
flag=1
return xx, yy
from datetime import timedelta
c = 20
n = len(y)
x = date
#Compute the one-dimensional discrete Fourier Transform
Y = np.fft.fft(y)
#replace all Fourier components above c with 0
np.put(Y, range(c+1, n), 0.0)
#Compute the one-dimensional inverse discrete Fourier Transform
ifft = np.fft.ifft(Y)
xx, yy = get_slope_changes(x,ifft)
fig = plt.figure(figsize=(10,5))
ax = plt.subplot()
ax.plot(x, ifft, color='red', alpha=.70)
ax.plot(xx, yy, color='black', alpha=.70)
ax.tick_params(axis='x', rotation=45)
plt.title("First {c} fourier components".format(c=c))
plt.plot(x,y, label="Original dataset")
plt.grid(linestyle='dashed')
plt.legend()
plt.show()
time_delta = []
for ii in range(1,len(xx)):
time_delta.append((xx[ii]-xx[ii-1]).days)
values, counts = np.unique(time_delta, return_counts=True)
for ii in range(0,len(counts)):
print("delta_t = %d: %d" % (values[ii], counts[ii]))