In [14]:
import numpy as np
import matplotlib.pyplot as plt

Symbols (sympy)

it is possible to use symbols in Python using sympy. It is way more clumsy than Mathematica unfortunately, but still possible.

In [139]:
import sympy
sympy.__version__
Out[139]:
'1.1.1'

some input/output handlers and beautifier

In [130]:
from sympy import latex
from sympy.printing.mathml import mathml
from sympy import mathematica_code as mcode
from IPython.display import display

let us introduce some symbols. A symbol is a variable that is handled by sympy

In [131]:
from sympy.abc import theta, lamda
In [132]:
sympy.init_printing()
In [133]:
display(theta)
$$\theta$$
In [134]:
y = sympy.sin(theta)
In [106]:
display(y)
$$\sin{\left (\theta \right )}$$
In [149]:
print(theta.is_complex)
None

An function with know integral: the exponential

In [150]:
x, l, z = sympy.symbols('x l z',real=True,positive=True)
In [118]:
sympy.abc.lamda
Out[118]:
$$\lambda$$
In [172]:
cdf = sympy.symbols('cdf')
cdf=sympy.integrate(l*sympy.exp(-l*z), (z, 0, x))
In [173]:
display(cdf)
$$1 - e^{- l x}$$
In [174]:
mathml(cdf)
Out[174]:
'<apply><minus/><cn>1</cn><apply><exp/><apply><minus/><apply><times/><ci>l</ci><ci>x</ci></apply></apply></apply></apply>'
In [175]:
latex(cdf)
Out[175]:
'1 - e^{- l x}'
In [176]:
mcode(cdf)
Out[176]:
'1 - Exp[-l*x]'

we evaluate the function cdf with the method subs

In [199]:
%timeit cdf.subs([(x,1.0),(l,1.0)])
95.9 µs ± 542 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [206]:
cdf.subs([(x,1.0),(l,1.0)])
Out[206]:
$$0.632120558828558$$
In [178]:
sol = sympy.symbols('sol')
sol = sympy.solveset(sympy.Eq(cdf,0.19),x,domain=sympy.S.Reals)
print(sol)
{0.210721031315653/l}
In [179]:
sol.subs([(l,2.2)])
Out[179]:
$$\left\{0.0957822869616602\right\}$$
In [181]:
r_values=np.random.rand(5)
In [312]:
r_values
Out[312]:
array([ 0.21554971,  0.23280715,  0.52857267,  0.02492478,  0.17433991])

Solve for $x\in \mathbb{R}$: $$CDF(x)=\int^x_0 pdf(z) dz =\xi$$

In [187]:
def solver(xi,laa):
    return (sympy.solveset(sympy.Eq(cdf, xi ),x,domain=sympy.S.Reals)).subs([(l,laa)])

just a different definition in which I substite $\lambda$ before solving

In [184]:
def solver(xi,laa):
    return (sympy.solveset(sympy.Eq(cdf.subs(l,laa), xi ),x,domain=sympy.S.Reals))
In [193]:
r_values=np.random.rand(5)
[ solver(xi,3.0) for xi in  r_values ];
In [192]:
r_values=np.random.rand(5)
list(map(lambda xi: solver(xi,3.0), r_values)) 
Out[192]:
$$\left [ \left\{0.378874375691698\right\}, \quad \left\{0.145999136103724\right\}, \quad \left\{0.0463692430915799\right\}, \quad \left\{0.540111049640181\right\}, \quad \left\{0.161547902714614\right\}\right ]$$
In [194]:
print(r_values)
[ 0.51349915  0.54612983  0.92323409  0.45485857  0.13175952]

Speedup?

In [201]:
T = cdf.subs([(x,1.0),(l,1.0)])
In [207]:
cdf_lambda_function = sympy.lambdify((x,l), T, "numpy")
In [209]:
cdf_lambda_function((1.0,1.0))
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-209-cb1c4b209fee> in <module>()
----> 1 cdf_lambda_function((1.0,1.0))

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sympy/utilities/lambdify.py in wrapper(*argsx, **kwargsx)
    442                 newargs = [asarray(i) if isinstance(i, integer_types + (float,
    443                     complex)) else i for i in argsx]
--> 444                 return funcarg(*newargs, **kwargsx)
    445             return wrapper
    446         func = array_wrap(func)

TypeError: <lambda>() missing 1 required positional argument: '_Dummy_2348'

Numbers

In [210]:
from scipy import integrate
from scipy.optimize import fsolve
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

Known CDF

In [341]:
def CDF(x,lam):
    return 1 - scipy.exp(-lam*x)
In [342]:
CDF(1,1.2)
Out[342]:
0.69880578808779781
In [343]:
del eq
def eq(x,lam,xi):
    return CDF(x,lam) - xi
    #return x - lam*xi
In [344]:
r_values=np.random.rand(20000)
_exp_r_values = [ scipy.optimize.root(eq, 0.15 , args=(3.0, xii,)) for xii in r_values ]
In [345]:
 exp_r_values = [ sol['x'] for sol in _exp_r_values if sol['success'] ]
In [346]:
vals=np.transpose(np.array(exp_r_values))[0]
In [347]:
bins = np.linspace(0, 3, 15)
counts, binbegend , p  = plt.hist(vals,bins,log=True)

CDF is not known

In [211]:
def PDF(x,lam):
    return lam*scipy.exp(-lam*x)

the integratal of the PDF as function of the end of the range. (integrated with classical numerical mathods)

In [212]:
def CDFi(x,lam):
    return integrate.quad(PDF, 0, x,args=(lam,))
In [213]:
CDFi(1,1.2)
Out[213]:
$$\left ( 0.6988057880877979, \quad 7.758302756764011e-15\right )$$

the equation CDF(x) = xi

$$CDF(x)=\int^x_0 pdf(z) dz =\xi$$

In [216]:
#del eqI
def eqI(x,lam,xi):
    return CDFi(x,lam) - xi
    #return x - lam*xi

Solve for $x\in \mathbb{R}$:

In [352]:
r_values=np.random.rand(20000)
_exp_r_values = [ scipy.optimize.root(eqI, 0.15 , args=(3.0, xii,)) for xii in r_values ]
In [353]:
 exp_r_values = [ sol['x'] for sol in _exp_r_values if sol['success'] ]
In [354]:
len(exp_r_values)
Out[354]:
20000
In [355]:
vals=np.transpose(np.array(exp_r_values))[0]
In [356]:
bins = np.linspace(0, 3, 15)
counts, binbegend , p  = plt.hist(vals,bins,log=True)

CDF is not known

In [857]:
def PDF(x,lam):
    return lam*scipy.exp(-lam*x)

is it normalized on $[0,\infty$]?

In [836]:
integrate.quad(PDF, 0, scipy.inf,args=(3.0,),epsrel=0.001)
Out[836]:
$$\left ( 1.0000000633244484, \quad 0.0005820656200999542\right )$$
In [837]:
def CDFi(x,lam):
    return integrate.quad(PDF, 0, x,args=(lam,))
In [838]:
CDFi(1,1.2)
Out[838]:
$$\left ( 0.6988057880877979, \quad 7.758302756764011e-15\right )$$
In [839]:
del eqI
def eqI(x,lam,xi):
    return CDFi(x,lam) - xi
    #return x - lam*xi
In [842]:
_exp_r_values[0]
Out[842]:
    fjac: array([[-1.]])
     fun: array([ -1.11022302e-16,  -7.77283736e-01])
 message: 'The solution converged.'
    nfev: 10
     qtf: array([  1.88515870e-13])
       r: array([-0.6681488])
  status: 1
 success: True
       x: array([ 0.50061889])
In [840]:
r_values=np.random.rand(20000)
_exp_r_values = [ scipy.optimize.root(eqI, 0.15 , args=(3.0, xii,)) for xii in r_values ]
In [846]:
 exp_r_values = [ sol['x'] for sol in _exp_r_values if sol['success'] ]
In [847]:
len(exp_r_values)
Out[847]:
$$20000$$
In [848]:
vals=np.transpose(np.array(exp_r_values))[0]
In [849]:
bins = np.linspace(0, 3, 15)
counts, binbegend , p  = plt.hist(vals,bins,log=False)

Hit or miss (Cos)

In [863]:
def pdf(x,omega):
    return 1+scipy.cos(-omega*x)

plot the PDF

In [864]:
x=np.linspace(0,1,1000);
y=pdf(x,34);
plt.plot(x,y)
plt.show()
In [865]:
def PDF(x,o):
    normalization=integrate.quad(pdf, 0, 1,args=(o,))
    return pdf(x,o)/normalization[0]
In [866]:
integrate.quad(pdf, 0, 1,args=(34.0,))
Out[866]:
$$\left ( 1.0155612554741182, \quad 1.131659692102621e-14\right )$$
In [867]:
integrate.quad(PDF, 0, 1,args=(34.0,))
Out[867]:
$$\left ( 1.0, \quad 1.1131364685626206e-14\right )$$
In [868]:
x=np.linspace(0,1,1000);
y=PDF(x,34);
plt.plot(x,y)
plt.show()

try to enclose it in a uniform distribution $h(x)=1\,\, for\,\, 0<x<1$

In [869]:
max(y)
Out[869]:
$$1.96935437348$$

it is just a contast function $C\cdot h(x)$ for $C=\max_{[0,1]}(y)$

In [870]:
nP=10**5;
x_values=np.random.rand(nP)
pdf_values=PDF(x_values,34);
u_values=np.random.rand(nP)
In [872]:
sub_set_x_values=x_values[pdf_values-u_values*max(y)>0]
plt.hist(sub_set_x_values,100,normed=True)
plt.show()
print(len(sub_set_x_values))
50956
In [873]:
10**5/max(y)
Out[873]:
$$50778.0627737$$
In [441]:
criterion_value=np.row_stack((pdf_values-u_values*max(y)>0,x_values))
selected_values=np.select([criterion_value[0].astype(bool)],[criterion_value[1]])
plt.hist(selected_values[selected_values>0],100,normed=True)
plt.show()

Hit or miss (Exp)

plot the PDF

In [858]:
x=np.linspace(0,1,1000);
y=PDF(x,1.2);
plt.plot(x,y)
plt.show()

try to enclose it in a uniform distribution $h(x)=1\,\, for\,\, 0<x<1$

In [859]:
max(y)
Out[859]:
$$1.2$$

it is just a contast function $C\cdot h(x)$ for $C=\max_{[0,1]}(y)$

In [248]:
_A=np.array([1,2])
_B=np.array([-1,-2])
print(np.row_stack((_A,_B)))
print(np.column_stack((_A,_B)))
print(np.vstack((_A,_B)))
print(np.hstack((_A,_B)))
[[ 1  2]
 [-1 -2]]
[[ 1 -1]
 [ 2 -2]]
[[ 1  2]
 [-1 -2]]
[ 1  2 -1 -2]
In [249]:
_A*_B
Out[249]:
array([-1, -4])
In [860]:
nP=10**5;
x_values=np.random.rand(nP)
pdf_values=PDF(x_values,1.2);
u_values=np.random.rand(nP)
In [862]:
sub_set_x_values=x_values[pdf_values-u_values*max(y)>0]
plt.hist(sub_set_x_values,100,normed=True)
plt.show()
print(len(sub_set_x_values))
58269
In [343]:
criterion_value=np.row_stack((pdf_values-u_values*max(y)>0,x_values))
selected_values=np.select([criterion_value[0].astype(bool)],[criterion_value[1]])
plt.hist(selected_values[selected_values>0],100,normed=True)
plt.show()

Excercise: redo this for $x \in [0,10]$

Discrete Distributions

In [40]:
values=np.random.choice(bins, 100000)
plt.hist(values)
plt.show()
In [38]:
probabilities=np.random.rand(len(bins))
probabilities=probabilities/np.sum(probabilities)
                             
values=np.random.choice(bins, 100000, p=probabilities)
plt.hist(values)
plt.show()

Discrete (from SciPy)

In [59]:
from scipy import stats
In [69]:
tuple(discrete_pdf)
Out[69]:
(0.0,
 0.0090049905062727486,
 0.025469959406023771,
 0.046791303235619364,
 0.072039924050181989,
 0.10067885454383056,
 0.13234579127384999,
 0.16677475806666719,
 0.20375967524819016,
 0.24313474366936419)
In [70]:
custm = stats.rv_discrete(name='custm', values=(bins, tuple(discrete_pdf)))
In [72]:
custm.pmf(0.2)
Out[72]:
0.0
In [63]:
fig, ax = plt.subplots(1, 1)
xk=bins

ax.plot(xk, custm.pmf(xk), 'ro', ms=12, mec='r')
ax.vlines(xk, 0, custm.pmf(xk), colors='r', lw=4)
plt.show()

Competing Forces

Warm-up

In [345]:
bins=np.arange(0,1,0.001)
discrete_pdf=np.power(bins,0.5)*(1-bins)
discrete_pdf=discrete_pdf/np.sum(discrete_pdf)
#print(bins);
#print(discrete_pdf);
print(np.sum(discrete_pdf))
plt.plot(bins,discrete_pdf)
plt.show()
values=np.random.choice(bins, 100000, p=discrete_pdf)
plt.hist(values,100)
plt.show()
1.0
In [496]:
bins=np.arange(0,1,0.001)
discrete_pdf=np.power(bins,0.5)*np.power((1-bins),2.3)
discrete_pdf=discrete_pdf/np.sum(discrete_pdf)
#print(bins);
#print(discrete_pdf);
print(np.sum(discrete_pdf))
plt.plot(bins,discrete_pdf)
plt.show()
values=np.random.choice(bins, 100000, p=discrete_pdf)
plt.hist(values,100,normed=True)
plt.show()
1.0

try to see its derivative

In [484]:
discrete_diff_pdf=np.diff(discrete_pdf)
#print(discrete_diff_pdf)
plt.plot(discrete_diff_pdf)
plt.show()

and its integral

In [485]:
discrete_diff_pdf=1-np.cumsum(discrete_pdf)
print(find_nearest(discrete_diff_pdf,0.5))
#print(discrete_diff_pdf)
plt.plot(discrete_diff_pdf)
plt.show()
(284, 0.50068259109816515)

a specular behaviour on the other party

In [486]:
bins=np.arange(0,1,0.001)
discrete_pdf=np.power(bins,0.75)*np.power((1-bins),2.3)
discrete_pdf=discrete_pdf/np.sum(discrete_pdf)
bins=1-(bins-1)
#print(bins);
#print(discrete_pdf);
#print(np.sum(discrete_pdf))
plt.plot(bins,discrete_pdf)
plt.show()
values=np.random.choice(bins, 100000, p=discrete_pdf)
plt.hist(values,100)
plt.show()
In [487]:
discrete_diff_pdf=np.cumsum(discrete_pdf)
print(find_nearest(discrete_diff_pdf,0.5))
#print(discrete_diff_pdf)
plt.plot(discrete_diff_pdf)
plt.show()
(324, 0.49913647232260239)

Tentative modeling

  • at each step a buyer and a seller come and make an offer for one unit at a price drawn from distribution of buy and sell orders at that time
  • each bid moves the median buy, each offer moves the median sell (do it for 10 bids)
In [488]:
#find the current median as find_nearest( np.cumsum(pdf_values) ,0.5)
def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return idx,array[idx]
In [541]:
ary=np.array([0,1,2,3])
In [547]:
def midpoint(ary):
    return (ary[1:] + ary[:-1])/2
In [548]:
midpoint(ary)
Out[548]:
array([ 0.5,  1.5,  2.5])

Prototyping

In [825]:
bins=np.arange(0,1,0.001)
binning=np.arange(0,1,0.01)
In [826]:
discrete_pdf=np.power(bins,1)*np.power((1-bins),0.25)
discrete_pdf=discrete_pdf/np.sum(discrete_pdf)
binning=np.sort(1-(binning-1))
bins=1-(bins-1)
In [745]:
discrete_pdf=np.power(bins,11)*np.power((1-bins),1)
discrete_pdf=discrete_pdf/np.sum(discrete_pdf)
In [827]:
#print(bins);
#print(discrete_pdf);
print(np.sum(discrete_pdf))
values=np.random.choice(bins, 100000, p=discrete_pdf)
_counts,_bins, c = plt.hist(values,bins=binning,normed=True)
plt.show()
print(np.average(values))
print(np.median(values))
1.0
1.383838
1.358
In [784]:
pdf_of_counts=_counts/np.sum(_counts)
In [785]:
#median0=find_nearest( np.cumsum(pdf_of_counts) ,0.5)
_new_values=np.random.choice(midpoint(_bins), 10, p=pdf_of_counts)
#print(_new_values)
print(np.average(_new_values))
print(np.median(_new_values))
plt.hist(_new_values)
1.746
1.725
Out[785]:
(array([ 1.,  0.,  2.,  0.,  3.,  1.,  0.,  0.,  1.,  2.]),
 array([ 1.525,  1.567,  1.609,  1.651,  1.693,  1.735,  1.777,  1.819,
         1.861,  1.903,  1.945]),
 <a list of 10 Patch objects>)
In [786]:
new_values=np.append(values,_new_values)
print(np.average(new_values))
print(np.median(new_values))
1.65396069393
1.676
In [787]:
np.average(new_values)-np.average(values)
Out[787]:
$$9.20393060677e-06$$
In [725]:
_new_counts,_bins, c = plt.hist(new_values,bins=binning,normed=True);
new_pdf_of_counts=_new_counts/np.sum(_new_counts)
#median1=find_nearest( np.cumsum(new_pdf_of_counts) ,0.5)
#print(median0,median1)

Function

find the shift of the mean or median (depeding on an optional argument) after nP extractions

In [771]:
def shift_of_metric(metric='average',increment=10,a=0.5,b=2.3, bins=np.arange(0,1,0.001), binning=np.arange(0,1,0.01),opposite=False, interactive=False):
    discrete_pdf=np.power(bins,a)*np.power((1-bins),b)
    discrete_pdf=discrete_pdf/np.sum(discrete_pdf)
    
    if opposite:
        binning=np.sort(1-(binning-1))
        bins=1-(bins-1)
    nRealData=10**4
    values=np.random.choice(bins, nRealData, p=discrete_pdf)
    if interactive:
        _counts,_bins, c = plt.hist(values,bins=binning,normed=True)        # do the histo
        plt.show()
    else:
        _counts,_bins = np.histogram(values,bins=binning,normed=True);
    pdf_of_counts=_counts/np.sum(_counts)
    _new_values=np.random.choice(midpoint(_bins), increment, p=pdf_of_counts)
    #np.average(_new_values)
    #np.median(_new_values)
    new_values=np.append(values,_new_values)
    if metric=="average":
        return np.average(new_values)- np.average(values)
    if metric=="median":
        return np.median(new_values)- np.median(values)

find the distribution of the mean or median shift

In [772]:
shift_of_metric(increment=10,interactive=True)
Out[772]:
$$2.90769230771e-06$$
In [790]:
shift_of_metric(increment=10,interactive=True,opposite=True)
Out[790]:
$$8.37673326675e-05$$
In [794]:
def shifts(increment,trials=1000,opposite=False,a=1,b=1):
    return np.array([ shift_of_metric(a=a,b=b,increment=increment,opposite=opposite) for k in range(trials) ])
In [795]:
[ [batch_size, np.log10(np.sqrt(np.var(shifts(batch_size))))] for batch_size in [10,30,100,300,1000,3000,10000] ]
Out[795]:
$$\left [ \left [ 10, \quad -4.15047654397\right ], \quad \left [ 30, \quad -3.90679446378\right ], \quad \left [ 100, \quad -3.65279212999\right ], \quad \left [ 300, \quad -3.42801052308\right ], \quad \left [ 1000, \quad -3.18234298568\right ], \quad \left [ 3000, \quad -3.04608723414\right ], \quad \left [ 10000, \quad -2.95409871529\right ]\right ]$$
In [796]:
[ [batch_size, np.log10(np.sqrt(np.var(shifts(batch_size,opposite=False))))] for batch_size in [10,30,100,300,1000,3000,10000] ]
Out[796]:
$$\left [ \left [ 10, \quad -4.15610125604\right ], \quad \left [ 30, \quad -3.91573091813\right ], \quad \left [ 100, \quad -3.6692434229\right ], \quad \left [ 300, \quad -3.42823412719\right ], \quad \left [ 1000, \quad -3.18111325087\right ], \quad \left [ 3000, \quad -3.02093932751\right ], \quad \left [ 10000, \quad -2.9528608509\right ]\right ]$$
In [797]:
[ [batch_size, np.log10(np.abs(np.average(shifts(batch_size))))] for batch_size in [10,30,100,300,1000,3000,10000] ]
Out[797]:
$$\left [ \left [ 10, \quad -5.25051891887\right ], \quad \left [ 30, \quad -6.46058314483\right ], \quad \left [ 100, \quad -5.73999500636\right ], \quad \left [ 300, \quad -4.64027629562\right ], \quad \left [ 1000, \quad -4.75416576826\right ], \quad \left [ 3000, \quad -3.9796751226\right ], \quad \left [ 10000, \quad -3.67681506077\right ]\right ]$$
In [798]:
[ [batch_size, np.log10(np.abs(np.average(shifts(batch_size,opposite=False))))] for batch_size in [10,30,100,300,1000,3000,10000] ]
Out[798]:
$$\left [ \left [ 10, \quad -6.32333361027\right ], \quad \left [ 30, \quad -5.18709814787\right ], \quad \left [ 100, \quad -5.02069359842\right ], \quad \left [ 300, \quad -5.65106501739\right ], \quad \left [ 1000, \quad -4.51583584224\right ], \quad \left [ 3000, \quad -4.22952610802\right ], \quad \left [ 10000, \quad -3.7244421928\right ]\right ]$$
In [804]:
plt.clf()
pips_range=np.linspace(-0.001,0.001,22)
plt.hist(shifts(10),normed=True,bins=pips_range)
plt.hist(shifts(30),normed=True,color='cyan',alpha=0.7,bins=pips_range)
plt.hist(shifts(100),normed=True,color='green',alpha=0.3,bins=pips_range)
#plt.hist(shifts(300),normed=True,color='cyan',alpha=0.7)
#plt.hist(shifts(3000),normed=True,color='green',alpha=0.7)
#plt.hist(shifts(10000),normed=True,color='red',alpha=0.7)

plt.show()
In [828]:
def movement(increment=10,a=1,b=1):
    return shifts(increment,a=a,b=b)-shifts(increment,a=a,b=b,opposite=True)
In [833]:
plt.clf()
_A=0.4
_B=0.02
pips_range=np.linspace(-0.001,0.001,22)
plt.hist(movement(10,a=_A,b=_B),normed=True,bins=pips_range)
plt.hist(movement(30,a=_A,b=_B),normed=True,color='cyan',alpha=0.7,bins=pips_range)
plt.hist(movement(100,a=_A,b=_B),normed=True,color='green',alpha=0.3,bins=pips_range)
#plt.hist(shifts(300),normed=True,color='cyan',alpha=0.7)
#plt.hist(shifts(3000),normed=True,color='green',alpha=0.7)
#plt.hist(shifts(10000),normed=True,color='red',alpha=0.7)
del _A,_B
plt.show()
In [832]:
plt.clf()
_A=0.4
_B=1.8
pips_range=np.linspace(-0.001,0.001,22)
plt.hist(movement(10,a=_A,b=_B),normed=True,bins=pips_range)
plt.hist(movement(30,a=_A,b=_B),normed=True,color='cyan',alpha=0.7,bins=pips_range)
plt.hist(movement(100,a=_A,b=_B),normed=True,color='green',alpha=0.3,bins=pips_range)
#plt.hist(shifts(300),normed=True,color='cyan',alpha=0.7)
#plt.hist(shifts(3000),normed=True,color='green',alpha=0.7)
#plt.hist(shifts(10000),normed=True,color='red',alpha=0.7)
del _A,_B
plt.show()