# 数値演算、描画用ライブラリ読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 日本語表示用ライブラリ（matplotlibで日本語を使用したい場合）
import japanize_matplotlib

# グラフをinline表示可能にする
%matplotlib inline
# 解像度を上げてinline表示する
%config InlineBackend.figure_format = 'retina'


# Read the 'dsa_04' worksheet directly from the hosted Excel workbook;
# the first row of the sheet (header=0) supplies the column names.
url = 'https://www.ces-alpha.org/course/file_serve/5697111496589312/dsa_data1.xlsx'
df4 = pd.read_excel(
    url,
    sheet_name='dsa_04',
    header=0,
)
# Preview the first rows of the loaded table.
df4.head()


# Convert each subject's score column from a pandas Series to a NumPy array.
# NOTE(review): the name `math` would shadow the standard-library module of
# the same name if that module were imported in this session.
kokugo, math, rika, syakai, eigo = (
    df4[column].to_numpy()
    for column in ('国語', '数学', '理科', '社会', '英語')
)


# Score columns, one pandas Series per subject.
# (The NumPy arrays kokugo, math, rika, syakai, eigo are an alternative input.)
subject_names = ('国語', '数学', '理科', '社会', '英語')
subjects = [df4[name] for name in subject_names]

# Box-and-whisker plot: one box per subject, at x positions 1 through 5.
plt.boxplot(subjects)
plt.xticks([1, 2, 3, 4, 5], subject_names)

# Title and axis labels.
plt.title('科目別得点分布', fontsize=14)
plt.xlabel('科目', fontsize=12)
plt.ylabel('得点分布', fontsize=12)

# Fixed score range on the y-axis, with grid lines switched off.
plt.ylim([0, 105])
plt.grid(False)


# Score columns, one pandas Series per subject.
subjects = [df4[name] for name in ('国語', '数学', '理科', '社会', '英語')]

# Title and axis labels.
plt.title('科目別得点分布', fontsize=18)
plt.xlabel('科目', fontsize=16)
plt.ylabel('点数', fontsize=16)

# Marker style for outliers: filled red circles without an edge.
flierprops = {
    'marker': 'o',
    'markerfacecolor': 'red',
    'markersize': 8,
    'markeredgecolor': 'none',
}

# Box styling: patch_artist=True is set so the boxes can take a face colour;
# the median is drawn as a thin black line.
plt.gca().boxplot(
    subjects,
    patch_artist=True,
    flierprops=flierprops,
    boxprops={'facecolor': 'pink'},
    medianprops={'color': 'k', 'linewidth': 1},
)

# Subject names on the x-axis and a fixed score range on the y-axis.
plt.xticks([1, 2, 3, 4, 5], ('国語', '数学', '理科', '社会', '英語'))
plt.ylim(0, 105);


# Median score of each subject, in the order 国語, 数学, 理科, 社会, 英語.
tuple(df4[subject].median() for subject in ('国語', '数学', '理科', '社会', '英語'))

(66.0, 70.5, 65.0, 76.0, 73.5)


# numpy.array型に median() はない！
#df4['国語'].to_numpy().median()


# Print the type of the NumPy array built from the '国語' column, followed by
# every attribute and method name that dir() reports for it.
kokugo_array = df4['国語'].to_numpy()
print(type(kokugo_array))
print(dir(kokugo_array))

<class 'numpy.ndarray'>
['T', '__abs__', '__add__', '__and__', '__array__', '__array_finalize__', '__array_function__', '__array_interface__', '__array_prepare__', '__array_priority__', '__array_struct__', '__array_ufunc__', '__array_wrap__', '__bool__', '__class__', '__complex__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dir__', '__divmod__', '__doc__', '__eq__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__ilshift__', '__imatmul__', '__imod__', '__imul__', '__index__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__irshift__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lshift__', '__lt__', '__matmul__', '__mod__', '__mul__', '__ne__', '__neg__', '__new__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rlshift__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__xor__', 'all', 'any', 'argmax', 'argmin', 'argpartition', 'argsort', 'astype', 'base', 'byteswap', 'choose', 'clip', 'compress', 'conj', 'conjugate', 'copy', 'ctypes', 'cumprod', 'cumsum', 'data', 'diagonal', 'dot', 'dtype', 'dump', 'dumps', 'fill', 'flags', 'flat', 'flatten', 'getfield', 'imag', 'item', 'itemset', 'itemsize', 'max', 'mean', 'min', 'nbytes', 'ndim', 'newbyteorder', 'nonzero', 'partition', 'prod', 'ptp', 'put', 'ravel', 'real', 'repeat', 'reshape', 'resize', 'round', 'searchsorted', 'setfield', 'setflags', 'shape', 'size', 'sort', 'squeeze', 'std', 'strides', 'sum', 'swapaxes', 'take', 'tobytes', 'tofile', 'tolist', 'tostring', 'trace', 'transpose', 'var', 'view']


# Print the type of the '国語' column as pandas returns it, followed by every
# attribute and method name that dir() reports for it.
kokugo_series = df4['国語']
print(type(kokugo_series))
print(dir(kokugo_series))

<class 'pandas.core.series.Series'>
['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_REVERSED', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__array_wrap__', '__bool__', '__class__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__long__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_accum_func', '_add_numeric_operations', '_agg_by_level', '_agg_examples_doc', '_agg_see_also_doc', '_align_frame', '_align_series', '_arith_method', '_as_manager', '_attrs', '_binop', '_cacher', '_can_hold_na', '_check_inplace_and_allows_duplicate_labels', '_check_inplace_setting', '_check_is_chained_assignment_possible', '_check_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_with_one_bound', '_clip_with_scalar', '_cmp_method', '_consolidate', '_consolidate_inplace', '_construct_axes_dict', '_construct_axes_from_arguments', '_construct_result', '_constructor', '_constructor_expanddim', '_convert', '_convert_dtypes', '_data', '_dir_additions', '_dir_deletions', '_drop_axis', 
'_drop_labels_or_levels', '_duplicated', '_find_valid_index', '_flags', '_from_mgr', '_get_axis', '_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cacher', '_get_cleaned_column_resolvers', '_get_index_resolvers', '_get_label_or_level_values', '_get_numeric_data', '_get_value', '_get_values', '_get_values_tuple', '_get_with', '_gotitem', '_hidden_attrs', '_index', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_init_dict', '_init_mgr', '_inplace_method', '_internal_names', '_internal_names_set', '_is_cached', '_is_copy', '_is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mixed_type', '_is_view', '_item_cache', '_ixs', '_logical_func', '_logical_method', '_map_values', '_maybe_update_cacher', '_memory_usage', '_metadata', '_mgr', '_min_count_stat_function', '_name', '_needs_reindex_multi', '_protect_consolidate', '_reduce', '_reindex_axes', '_reindex_indexer', '_reindex_multi', '_reindex_with_indexers', '_replace_single', '_repr_data_resource_', '_repr_latex_', '_reset_cache', '_reset_cacher', '_set_as_cached', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_is_copy', '_set_labels', '_set_name', '_set_value', '_set_values', '_set_with', '_set_with_engine', '_slice', '_stat_axis', '_stat_axis_name', '_stat_axis_number', '_stat_function', '_stat_function_ddof', '_take_with_is_copy', '_typ', '_update_inplace', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'append', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autocorr', 'axes', 'backfill', 'between', 'between_time', 'bfill', 'bool', 'clip', 'combine', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 
'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'ge', 'get', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'iteritems', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mad', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'set_flags', 'shape', 'shift', 'size', 'skew', 'slice_shift', 'sort_index', 'sort_values', 'squeeze', 'std', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs']


# The 50th percentile from NumPy next to the pandas median of the same
# '数学' column, shown as a pair for comparison.
math_scores = df4['数学']
(np.percentile(math_scores, 50), math_scores.median())

(70.5, 70.5)


# Quartiles of the '数学' scores: 25th percentile, median, 75th percentile.
math_scores = df4['数学']
first_quartile = np.percentile(math_scores, 25)
third_quartile = np.percentile(math_scores, 75)

print('第1四分位数は', first_quartile, 'です')
print('中央値は', math_scores.median(), 'です')
print('第3四分位数は', third_quartile, 'です')

第1四分位数は 58.25 です
中央値は 70.5 です
第3四分位数は 83.75 です


# Read the Excel workbook hosted on the web directly into a DataFrame
# (pandas defaults are used for sheet and header), then display it.
url = 'https://www.ces-alpha.org/course/file_serve/5938265186107392/dji_w.xlsx'
df_dow = pd.read_excel(io=url)
df_dow


# Register pandas' datetime converters with matplotlib so that datetime
# values coming from the DataFrame can be placed on a plot axis.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Chart title. The year range is taken from the data itself so the title
# cannot disagree with what is actually plotted.
first_year = pd.Timestamp(df_dow['Date'].min()).year
last_year = pd.Timestamp(df_dow['Date'].max()).year
plt.title(f'ダウ工業株30種平均 ({first_year}-{last_year})', fontsize=16)

# Axis labels.
plt.xlabel('Year', fontsize=14)
plt.ylabel('株価 (USD)', fontsize=14)

# Shade the area under the closing-price curve, then draw the curve itself.
plt.fill_between(df_dow['Date'], df_dow["Close"], facecolor='gray', alpha=0.3, interpolate=True)
plt.plot(df_dow['Date'], df_dow["Close"], c="k", lw=1.5)

# Rotate the x tick labels by 20 degrees.
plt.xticks(rotation=20);


# Chart title. The year range is taken from the data itself so the title
# cannot disagree with what is actually plotted.
first_year = pd.Timestamp(df_dow['Date'].min()).year
last_year = pd.Timestamp(df_dow['Date'].max()).year
plt.title(f'ダウ工業株30種平均 ({first_year}-{last_year})', fontsize=16)

# Axis labels, and grid lines on both axes for major and minor ticks.
plt.xlabel('Year', fontsize=14)
plt.ylabel('株価（USD: 対数目盛）', fontsize=14)
plt.grid(which='both', axis='both')

# Shade the area under the closing-price curve, then switch the y-axis to a
# logarithmic scale.
plt.fill_between(df_dow["Date"], df_dow["Close"], facecolor='gray', alpha=0.3, interpolate=True)
plt.yscale("log")

# Closing-price line. As in earlier cells, .to_numpy() may be appended to
# the columns if plain arrays are preferred.
plt.plot(df_dow['Date'], df_dow["Close"], c="k", lw=1.5)

# Rotate the x tick labels by 20 degrees.
plt.xticks(rotation=20);


import datetime

# Bounds of the displayed period.
window_start = datetime.datetime(2000, 1, 1)
window_end = datetime.datetime(2010, 1, 1)

# Title and axis labels.
plt.title('ダウ工業株30種平均 (2000-2010)', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('株価（USD）', fontsize=14)

# Grid lines on both axes, for major and minor ticks.
plt.grid(which='both', axis='both')

# Shade the area under the closing-price curve and draw the curve itself.
dates, closes = df_dow['Date'], df_dow['Close']
plt.fill_between(dates, closes, facecolor='gray', alpha=0.3, interpolate=True)
plt.plot(dates, closes, c='k', lw=1.5)

# Restrict the time axis to the chosen period and fix the price range.
plt.xlim(window_start, window_end)
plt.ylim(0, 16000)

# Rotate the x tick labels by 20 degrees.
plt.xticks(rotation=20);


# The 'Close' column converted to a NumPy array.
df_dow['Close'].to_numpy()

array([   48.31,    47.03,    47.29, ..., 33811.4 , 32977.21, 32899.37])


# The 'Close' column as pandas returns it (a Series, shown with its index).
df_dow['Close']

0          48.31
1          47.03
2          47.29
3          46.52
4          49.15
          ...   
6359    34721.12
6360    34451.23
6361    33811.40
6362    32977.21
6363    32899.37
Name: Close, Length: 6364, dtype: float64


import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker

# Displayed period: from 2000-01-01 to the last date present in the data.
# The data's own end is used instead of the wall-clock date so the figure is
# reproducible and has no empty stretch after the final sample.
period_start = datetime.datetime(2000, 1, 1)
period_end = pd.Timestamp(df_dow['Date'].max()).to_pydatetime()

# Chart title, built from the same bounds that are applied to the axis.
plt.title(f'ダウ工業株30種平均 ({period_start.year}-{period_end.year})', fontsize=16)

# Axis labels.
plt.xlabel('Year-Month', fontsize=14)
plt.ylabel('株価 (USD)', fontsize=14)

# Spacing and format of the time axis (x-axis): gca() gives access to the
# current Axes object so its xaxis can be configured directly.
# A major tick every 5 years on 1 January, labelled as year / month on two lines.
plt.gca().xaxis.set_major_locator(mdates.YearLocator(5, month=1, day=1))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y\n%m'))

# (Optional) y-axis major ticks every 4000:
#plt.gca().yaxis.set_major_locator(ticker.MultipleLocator(4000))

# Restrict the time axis to the chosen period, then draw the closing prices.
plt.xlim(period_start, period_end)
plt.plot(df_dow['Date'], df_dow['Close'], c="k", lw=1.5)

# Rotate the x tick labels by 20 degrees.
plt.xticks(rotation=20);


# Title and axis labels.
plt.title('ダウ工業株30種平均 (2020-2022)', fontsize=16)
plt.xlabel('Year-Month', fontsize=14)
plt.ylabel('株価 (USD)', fontsize=14)

# Displayed period: from 2020-01-01 to the last date present in the data.
# The data's own end is used instead of the wall-clock date so the figure is
# reproducible and has no empty stretch after the final sample.
period_start = datetime.datetime(2020, 1, 1)
period_end = pd.Timestamp(df_dow['Date'].max()).to_pydatetime()
plt.xlim(period_start, period_end)

plt.plot(df_dow['Date'], df_dow['Close'], c="k", lw=1.5)
plt.grid()

# MonthLocator(bymonth=None, bymonthday=1, interval=1, tz=None)
# Major ticks in January, April, July and October, labelled as year / month
# on two lines.
plt.gca().xaxis.set_major_locator(mdates.MonthLocator(bymonth=[1,4,7,10]))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y\n%m'))

# Rotate the x tick labels by 20 degrees.
plt.xticks(rotation=20);

# Shade a specific span of the time axis.
start_datetime = datetime.datetime(2020,3,1)
end_datetime = datetime.datetime(2020,5,1)
plt.axvspan(start_datetime, end_datetime, color="pink", alpha=0.3)

# Annotation: the arrow points at `xy`, the label text is placed at `xytext`.
plt.annotate("COVID-19",
             xy=(datetime.datetime(2020,4,1), 19000),
             xytext=(datetime.datetime(2020,5,1), 12000),
             arrowprops=dict(arrowstyle="->"), fontsize=14);


# Title and axis labels.
plt.title('ダウ工業株30種平均 (2021/07-2022/5)', fontsize=16)
plt.xlabel('Datetime', fontsize=14)
plt.ylabel('株価 (USD)', fontsize=14)

# Displayed period: from 2021-06-30 to the last date present in the data.
# The data's own end is used instead of the wall-clock date so the figure is
# reproducible and stays consistent with the fixed title and y-range.
period_start = datetime.datetime(2021, 6, 30)
period_end = pd.Timestamp(df_dow['Date'].max()).to_pydatetime()
plt.xlim(period_start, period_end)
plt.ylim(32000, 37000)
plt.grid()
plt.plot(df_dow['Date'], df_dow['Close'], c="k", lw=1.5)

# WeekdayLocator(byweekday=1, interval=1, tz=None)
# WeekdayLocator(interval=5)
# A major tick every 5th week on weekday index 1, labelled as
# year / month/day on two lines.
plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=1, interval=5, tz=None))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y\n%m/%d'))

# Rotate the x tick labels by 20 degrees.
plt.xticks(rotation=20);


# Bounds of the displayed period. The title is built from these same values
# so that it always matches the range applied to the x-axis.
period_start = datetime.datetime(2022, 3, 1)
period_end = datetime.datetime(2022, 5, 10)
period_label = '-'.join(
    f'{bound.year}/{bound.month}/{bound.day}' for bound in (period_start, period_end)
)

# Chart title.
plt.title(f'ダウ工業株30種平均 ({period_label})', fontsize=14)

# Axis labels.
plt.xlabel('Datetime', fontsize=14)
plt.ylabel('株価 (USD)', fontsize=14)

# Restrict the time axis to the chosen period and fix the price range.
plt.xlim(period_start, period_end)
plt.ylim(32000, 37000)
plt.plot(df_dow['Date'], df_dow['Close'], color="k", lw=1.5)

# DayLocator(bymonthday=None, interval=1, tz=None)
# DayLocator(bymonthday=[1,10,20])
# Major ticks on the 1st, 10th and 20th of each month, labelled as
# year / month/day on two lines.
plt.gca().xaxis.set_major_locator(mdates.DayLocator(bymonthday=[1,10,20]))
#plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=5))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y\n%m/%d'))

# Rotate the x tick labels by 20 degrees.
plt.xticks(rotation=20);


# Element-wise comparison: True for each row whose date is later than
# 2010-01-01, False otherwise.
cutoff = datetime.datetime(2010, 1, 1)
df_dow['Date'] > cutoff

0       False
1       False
2       False
3       False
4       False
        ...  
6359     True
6360     True
6361     True
6362     True
6363     True
Name: Date, Length: 6364, dtype: bool


# Select the rows dated strictly after 2020-12-01 and strictly before
# 2021-03-01 by combining two boolean masks.
dates = df_dow['Date']
term1 = dates > datetime.datetime(2020, 12, 1)
term2 = dates < datetime.datetime(2021, 3, 1)
df_dow[term1 & term2]


# Small integer array used in the following cells to demonstrate boolean masks.
mylist = np.array(
    [12, 5, 16, 3, 9, 17],
)
mylist

array([12,  5, 16,  3,  9, 17])


# Element-wise comparison: a boolean array marking the entries greater than 5.
threshold = 5
mylist > threshold

array([ True, False,  True, False,  True,  True])


# Boolean-mask indexing: keep only the entries greater than 5.
above_five = mylist > 5
mylist[above_five]

array([12, 16,  9, 17])


# First condition: entries greater than 5.
cond1 = mylist > 5
cond1

array([ True, False,  True, False,  True,  True])


# Second condition: entries less than or equal to 16.
cond2 = mylist <= 16


# Element-wise AND of the two conditions: True only where both hold.
cond = (cond1) & (cond2)
cond

array([ True, False,  True, False,  True, False])

	Date	Open	High	Low	Close
0	1900-01-07	49.34	49.34	48.24	48.31
1	1900-01-14	48.10	48.10	45.82	47.03
2	1900-01-21	46.51	47.29	46.51	47.29
3	1900-01-28	47.34	47.34	46.52	46.52
4	1900-02-04	46.68	49.15	46.68	49.15
...	...	...	...	...	...
6359	2022-04-10	34799.98	35112.21	34190.95	34721.12
6360	2022-04-17	34630.27	34889.17	34102.81	34451.23
6361	2022-04-24	34411.49	35521.00	33773.39	33811.40
6362	2022-05-01	33731.65	34106.01	32913.15	32977.21
6363	2022-05-08	32978.49	34117.74	32449.87	32899.37

	Date	Open	High	Low	Close
6289	2020-12-06	29854.51	30218.26	29463.64	30218.26
6290	2020-12-13	30233.03	30319.70	29820.84	30046.37
6291	2020-12-20	30123.91	30343.59	29849.15	30179.05
6292	2020-12-27	30159.00	30304.14	29755.53	30199.87
6293	2021-01-03	30283.23	30637.47	30274.24	30606.48
6294	2021-01-10	30627.47	31193.40	29881.82	31097.97
6295	2021-01-17	31015.37	31223.78	30612.67	30814.26
6296	2021-01-24	30887.42	31272.22	30865.03	30996.98
6297	2021-01-31	30989.85	31121.42	29856.30	29982.62
6298	2021-02-07	30054.73	31252.18	30014.97	31148.24
6299	2021-02-14	31191.20	31543.82	31191.20	31458.40
6300	2021-02-21	31472.08	31647.53	31285.32	31494.32
6301	2021-02-28	31381.12	32009.64	30911.37	30932.37

DSA 2022/5/19¶

箱ひげ図の描画¶

5. 時系列データの描画¶

¶

時系列軸（x軸）の間隔&フォーマット指定¶

日付＆時間フォーマット¶

(1) YearLocator¶

(2) MonthLocator¶

(3) WeekdayLocator¶

(4) DayLocator¶

	国語	数学	理科	社会	英語
0	94	76	62	81	62
1	63	57	59	80	74
2	55	66	82	78	66
3	61	87	80	65	78
4	56	58	69	97	61