import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time, random
import japanize_matplotlib
from numpy.random import randint
from scipy.stats import norm
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


# 男子高校生100人の体重データ：単位 [kg]
weight = np.array([43.6, 45.2, 45.4, 45.8, 47.2, 47.8, 48.2, 48.7, 48.8, 48.9, 49., 49., 49.4, 49.5, 49.8,
                   50.4, 50.5, 50.9, 50.9, 51.2, 51.2, 51.2, 51.3, 51.3, 51.6, 51.7, 51.7, 51.8, 52., 52.,
                   52.1, 52.1, 52.1, 52.2, 52.3, 52.7, 52.7, 52.8, 52.9, 52.9, 53.1, 53.1, 53.8, 54.,  54.5,
                   54.5, 54.6, 54.7, 54.7, 54.7, 54.8, 54.9, 55.1, 55.1, 55.2, 55.3, 55.4, 55.4, 55.4, 55.6,
                   55.7, 55.8, 55.9, 56.1, 56.3, 56.3, 56.3, 56.4, 56.5, 56.7, 56.8, 57., 57.1, 57.1, 57.2,
                   57.3, 57.6, 57.7, 57.8, 58.1, 58.4, 58.6, 58.7, 58.7, 58.7, 58.7, 59.1, 59.3, 59.9, 60.,
                   60.1, 60.3, 60.5, 60.6, 60.6, 60.7, 61.3, 62.7, 64.2, 64.6])
weight

array([43.6, 45.2, 45.4, 45.8, 47.2, 47.8, 48.2, 48.7, 48.8, 48.9, 49. ,
       49. , 49.4, 49.5, 49.8, 50.4, 50.5, 50.9, 50.9, 51.2, 51.2, 51.2,
       51.3, 51.3, 51.6, 51.7, 51.7, 51.8, 52. , 52. , 52.1, 52.1, 52.1,
       52.2, 52.3, 52.7, 52.7, 52.8, 52.9, 52.9, 53.1, 53.1, 53.8, 54. ,
       54.5, 54.5, 54.6, 54.7, 54.7, 54.7, 54.8, 54.9, 55.1, 55.1, 55.2,
       55.3, 55.4, 55.4, 55.4, 55.6, 55.7, 55.8, 55.9, 56.1, 56.3, 56.3,
       56.3, 56.4, 56.5, 56.7, 56.8, 57. , 57.1, 57.1, 57.2, 57.3, 57.6,
       57.7, 57.8, 58.1, 58.4, 58.6, 58.7, 58.7, 58.7, 58.7, 59.1, 59.3,
       59.9, 60. , 60.1, 60.3, 60.5, 60.6, 60.6, 60.7, 61.3, 62.7, 64.2,
       64.6])


# 基本グラフ
plt.title("とあるクラス男子の体重分布", fontsize=18)
plt.xlabel("体重 [kg]", fontsize=14)
plt.ylabel("生徒数", fontsize=14)

hist_ini = 43; hist_fin = 65; hist_step = 2
bins = np.arange(hist_ini, hist_fin+1, hist_step)  # 終点は含まないので＋1としている
dosu, cls = np.histogram(weight, bins)

my_ticks = np.arange(hist_ini, hist_fin, hist_step)
plt.bar(my_ticks, dosu, align='edge',color="y", ec="black", width=2)
plt.xticks(np.append(my_ticks,hist_fin))
plt.yticks(np.arange(0,21,5))
plt.text(42.5, 20, f"n={dosu.sum()}", fontsize=16);


print(f'データ数は、{len(weight)} 件です。')
print(f'体重の平均値は、{weight.mean():.2f} [kg] です。')
print(f'体重の標準偏差は、{weight.std(ddof=0):.2f} [kg] です。')
print(f'体重の中央値(median)は、{np.median(weight):.2f} [kg] です。')
print(f'体重の第1四分位数は、{np.percentile(weight, 25):.2f} [kg] です。')
print(f'体重の第3四分位数は、{np.percentile(weight, 75):.2f} [kg] です。')
print(f'体重の四分位範囲は、{np.percentile(weight, 75) - np.percentile(weight, 25):.2f} [kg] です。')

データ数は、100 件です。
体重の平均値は、54.46 [kg] です。
体重の標準偏差は、4.22 [kg] です。
体重の中央値(median)は、54.75 [kg] です。
体重の第1四分位数は、51.68 [kg] です。
体重の第3四分位数は、57.23 [kg] です。
体重の四分位範囲は、5.55 [kg] です。


trial_num = 10
R = randint(1,7,trial_num)
sa = np.setdiff1d(np.array(range(1,7)), R)

labels, counts = np.unique(R, return_counts=True)
labels = np.append(labels,sa)
counts = np.append(counts,np.zeros(len(sa)))

plt.bar(labels, counts, align='center')
plt.gca().set_xticks(labels)
plt.xlabel('x（出た目）', fontsize=14)
plt.ylabel('度数', fontsize=14)
print(R)
print(labels, counts)
print(f'サイコロを {trial_num:,} 回振って出た目の平均値は {R.mean():.2f} です。')

[6 3 2 2 5 1 4 5 1 3]
[1 2 3 4 5 6] [2. 2. 2. 1. 2. 1.]
サイコロを 10 回振って出た目の平均値は 3.20 です。


trial_num = 10   # ここを例えば1000　などに変更し、挙動を確認してみよ。
ave = []

for _ in range(10):
    R = randint(1,7,trial_num)
    print(f'サイコロを {trial_num:,} 回振って出た目の平均値は {R.mean():.3f} です。')
    ave.append(R.mean())
    time.sleep(0.5)
foo = np.array(ave)
print(foo)
print(f'\n上のようにサイコロを {trial_num:,} 回振って出た目の平均を取る操作を繰り返したとき、その平均値（平均の平均のこと）は {foo.mean():.4f} で、バラつき(＝標準偏差）は {foo.std():.4f} です。')

サイコロを 10 回振って出た目の平均値は 2.900 です。
サイコロを 10 回振って出た目の平均値は 3.200 です。
サイコロを 10 回振って出た目の平均値は 3.200 です。
サイコロを 10 回振って出た目の平均値は 3.300 です。
サイコロを 10 回振って出た目の平均値は 3.700 です。
サイコロを 10 回振って出た目の平均値は 3.600 です。
サイコロを 10 回振って出た目の平均値は 3.100 です。
サイコロを 10 回振って出た目の平均値は 3.800 です。
サイコロを 10 回振って出た目の平均値は 3.500 です。
サイコロを 10 回振って出た目の平均値は 2.900 です。
[2.9 3.2 3.2 3.3 3.7 3.6 3.1 3.8 3.5 2.9]

上のようにサイコロを 10 回振って出た目の平均を取る操作を繰り返したとき、その平均値（平均の平均のこと）は 3.3200 で、バラつき(＝標準偏差）は 0.3027 です。


sample_num = 2
dice = np.array(range(1,7))
trial_array=[]
repeat = 10000
for i in range(repeat): #　以下をrepeat回繰り返す
    d = random.choices(dice, k=sample_num) #復元抽出
    # 出た目の和
    sum_1sample = np.sum(d)
    trial_array.append(sum_1sample)

plt.xlabel("出た目の和（or 平均 $\overline{X}$）", fontsize=14)
plt.ylabel("度数", fontsize=14)
plt.hist(trial_array, bins=200, facecolor='c');


center = 0; sigma = 1
z_pos = 1

#Both sides
x1 = np.linspace(center - 4*sigma, center + 4*sigma, 2**8)
y1 = norm.pdf(x1, loc=center, scale=sigma)
x2 = np.linspace(center - z_pos*sigma, center + z_pos*sigma, 2**8)
y2 = norm.pdf(x2, loc=center, scale=sigma)

plt.plot(x1, y1, color="c")
plt.plot(x2, y2, color="c")
plt.fill_between(x2, 0, y2, facecolor='c', alpha=0.3)

plt.vlines(x=-z_pos, ymin=-0, ymax=norm.pdf(z_pos,loc=center,scale=sigma),
           linestyles='dashed', linewidths=1)
plt.vlines(x=z_pos, ymin=-0, ymax=norm.pdf(z_pos,loc=center,scale=sigma),
           linestyles='dashed', linewidths=1)

plt.xlabel("$\sigma$ (standard deviation)", fontsize=14)
plt.ylabel("Probability", fontsize=14)
prob = 2*norm.cdf(x=z_pos, loc=center, scale=sigma)-1
print(f"区間 [{-z_pos: .1f}*sigma,{z_pos: .1f}*sigma]に入る確率:{prob: .3f}")

区間 [-1.0*sigma, 1.0*sigma]に入る確率: 0.683


# 正規分布表の生成（表5.1）
z = [k for k in np.arange(0,4,0.1)]
for k in z:
    print(f"{k:.2f}  {norm.cdf(k, loc=0, scale=1):.6f}")

0.00  0.500000
0.10  0.539828
0.20  0.579260
0.30  0.617911
0.40  0.655422
0.50  0.691462
0.60  0.725747
0.70  0.758036
0.80  0.788145
0.90  0.815940
1.00  0.841345
1.10  0.864334
1.20  0.884930
1.30  0.903200
1.40  0.919243
1.50  0.933193
1.60  0.945201
1.70  0.955435
1.80  0.964070
1.90  0.971283
2.00  0.977250
2.10  0.982136
2.20  0.986097
2.30  0.989276
2.40  0.991802
2.50  0.993790
2.60  0.995339
2.70  0.996533
2.80  0.997445
2.90  0.998134
3.00  0.998650
3.10  0.999032
3.20  0.999313
3.30  0.999517
3.40  0.999663
3.50  0.999767
3.60  0.999841
3.70  0.999892
3.80  0.999928
3.90  0.999952


# 標準正規分布の任意のパーセント点の算出 (norm.ppf)
# 例：64％点（-∞からの面積が0．64となる点のx座標）
z = 0.64
print(f"{z*100:.1f}%点： {norm.ppf(z, loc=0, scale=1):.6f}")

64.0%点： 0.358459

データサイエンス実践B（2022年度第2ターム）¶

第１講記述統計の復習¶

代表的な統計量（全データが既知なので、正確に計算可能）¶

統計学がうまく行く理由（つまるところ、）¶

サイコロ実験¶

平均値のバラつきのサンプルサイズ依存性¶

中心極限定理¶

仮説・検定のココロ¶

データサイエンス実践B（2022年度 第2ターム）¶

第１講 記述統計の復習¶

代表的な統計量（全データが既知なので、正確に計算可能）¶

統計学がうまく行く理由（つまるところ、）¶

サイコロ実験¶

平均値のバラつきのサンプルサイズ依存性¶

中心極限定理¶

仮説・検定のココロ¶

データサイエンス実践B（2022年度第2ターム）¶

第１講記述統計の復習¶