Contents

import numpy as np
import matplotlib.pyplot as pl
%matplotlib inline
# Read the whitespace-delimited table; column 0 becomes `num` (used as the
# x-axis below) and column 1 becomes `vals` (the quantity being analysed).
data = np.loadtxt('../data/sizedistribution.dat')
num, vals = data[:, 0], data[:, 1]
# Raw values against their index, before any transformation.
pl.plot(num, vals)
[<matplotlib.lines.Line2D at 0x761bfa5eb6b0>]
../_images/640eb091ac5e3c27818c1a57b6d5b94aee5bf40a696457cb0570163357888dc6.png

Isn’t it clear from the plot that the data do not look like a normally distributed random variable?

# Histogram of the raw values using 21 equal-width bins.
pl.hist(vals, bins=21)
(array([565., 255.,  81.,  39.,  21.,  13.,  10.,   3.,   2.,   3.,   2.,
          3.,   1.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   1.]),
 array([  1.2073    ,  23.72266667,  46.23803333,  68.7534    ,
         91.26876667, 113.78413333, 136.2995    , 158.81486667,
        181.33023333, 203.8456    , 226.36096667, 248.87633333,
        271.3917    , 293.90706667, 316.42243333, 338.9378    ,
        361.45316667, 383.96853333, 406.4839    , 428.99926667,
        451.51463333, 474.03      ]),
 <BarContainer object of 21 artists>)
../_images/8a1e4950a12776019eb143cf1c6c461d2fada8c05fc514971b47fe2681cd39d0.png
# Same 21-bin histogram, but of the natural logarithm of the values.
pl.hist(np.log(vals), bins=21)
(array([  1.,   4.,  20.,  23.,  35.,  48.,  67.,  97., 104., 107., 113.,
        119.,  95.,  57.,  45.,  26.,  21.,   6.,   9.,   1.,   2.]),
 array([0.18838646, 0.47280952, 0.75723257, 1.04165563, 1.32607868,
        1.61050174, 1.89492479, 2.17934784, 2.4637709 , 2.74819395,
        3.03261701, 3.31704006, 3.60146312, 3.88588617, 4.17030923,
        4.45473228, 4.73915534, 5.02357839, 5.30800145, 5.5924245 ,
        5.87684756, 6.16127061]),
 <BarContainer object of 21 artists>)
../_images/18158969c949291de046885ce93a38e26be7686fa02decf923e17c7e7e974b33.png
# Work with the natural log of the values from here on.
newvals = np.log(vals)
# Log-transformed values against the same x-axis as the raw plot.
pl.plot(num, newvals)
[<matplotlib.lines.Line2D at 0x761bfa5c5f70>]
../_images/003467e8d56a43a6ac2236128bcb6f73feb5fd62628e2863b0884612e9bb4e19.png
# Two ways of summarising the sample, printed in the same layout:
#   "Wrong:"   - arithmetic mean and standard deviation of the raw values
#                (np.std with its default ddof).
#   "Correct:" - statistics taken in log space and mapped back with exp,
#                i.e. the geometric mean and the multiplicative (geometric)
#                standard deviation; the log-space std uses ddof=1.
for heading, average, spread in (
    ("Wrong:", np.mean(vals), np.std(vals)),
    (
        "Correct:",
        np.exp(np.mean(newvals)),
        np.exp(np.std(newvals, ddof=1)),
    ),
):
    print(heading)
    print("Average =  %.3f" % average)
    print("Standard Deviation = %.3f" % spread)
Wrong:
Average =  32.048
Standard Deviation = 39.773
Correct:
Average =  19.980
Standard Deviation = 2.637
from scipy.stats import lognorm, norm
# Fit a normal distribution to the log-values; `param` is (loc, scale).
param = norm.fit(newvals)
# 100 evenly spaced abscissae spanning the observed range of the log-values.
x = np.linspace(newvals.min(), newvals.max(), num=100)
# Density of the fitted normal; (loc, scale) are passed positionally.
pdf_fitted = norm.pdf(x, *param)

# Overlay the fitted curve (red line) on a normalised, translucent histogram.
pl.figure()
pl.plot(x, pdf_fitted, 'r-')
pl.hist(newvals, density=True, alpha=0.3)
(array([0.0100454 , 0.08036319, 0.15737791, 0.30471041, 0.39679323,
        0.38842207, 0.20927913, 0.08873435, 0.03348466, 0.0050227 ]),
 array([0.18838646, 0.78567488, 1.38296329, 1.98025171, 2.57754012,
        3.17482854, 3.77211695, 4.36940537, 4.96669378, 5.5639822 ,
        6.16127061]),
 <BarContainer object of 10 artists>)
../_images/c4daeb17b4e3a6d205f1819336180c2dc9a5b55b77e756fe6b9875b3b41b3fba.png
# First line: fitted (loc, scale) in log space, i.e. the log-normal parameters.
# Second line: the same pair mapped back to the original scale with exp.
print(param, np.exp(param), sep="\n")
(np.float64(2.9947081784485516), np.float64(0.9690344007390227))
[19.97952858  2.63539849]