import hashlib
import os

import numpy as np
import pandas as pd
from rich.pretty import pprint
# Build a reproducible synthetic DataFrame mixing several column kinds:
# floats, ints, timestamps, per-row NumPy arrays and strings.
np.random.seed(42)  # fixed seed so every run works on identical data
size = 10000  # number of rows in the frame

df = pd.DataFrame()
df["A"] = np.random.rand(size)
df["B"] = np.random.randint(0, 10000, size)
# np.sin is a ufunc, so apply it to the whole column at once instead of
# calling a Python lambda once per row.
df["sin"] = np.sin(df["A"])
df["datetime"] = pd.date_range("20240101", periods=size, freq="s")
# Each cell of "array" holds its own length-3 ndarray; the constant factor is
# created once rather than once per row.
weights = np.array([1, 2, 3])
df["array"] = df["B"].apply(lambda x: np.sin(x) * weights)
# Hex MD5 digest of each array's printed form: a 32-character string column.
df["string"] = df["array"].apply(
    lambda x: hashlib.md5(str(x).encode("utf8")).hexdigest()
)

print(df)
print(df.dtypes)
import timeit
# Output file for each storage format being compared.
XLSX, CSV = "test.xlsx", "test.csv"
PKL, PKLZ = "test.pkl", "test.pklz"
HDF, HDFZ = "test.hdf", "test.hdfz"
FTH, PQT = "test.feather", "test.parquet"
# Benchmark callables stored as consecutive (write, read) pairs, in the order
# xlsx, csv, pkl, pklz, hdf, hdfz, feather, parquet. Every callable accepts a
# single positional argument so writers and readers can be invoked the same
# way; the argument is ignored and the module-level `df` is what gets written.
funcs = [
    lambda x: df.to_excel(XLSX),
    lambda x: pd.read_excel(XLSX, index_col=0),
    lambda x: df.to_csv(CSV, mode="w"),
    lambda x: pd.read_csv(CSV, index_col=0),
    lambda x: df.to_pickle(PKL),
    lambda x: pd.read_pickle(PKL),
    lambda x: df.to_pickle(PKLZ, compression="xz"),
    lambda x: pd.read_pickle(PKLZ, compression="xz"),
    lambda x: df.to_hdf(HDF, key="test", mode="w"),
    lambda x: pd.read_hdf(HDF, "test"),
    # complib alone does not turn compression on: pandas leaves the file
    # uncompressed while complevel is None or 0, so the level is set
    # explicitly to make this case differ from the plain HDF one.
    lambda x: df.to_hdf(
        HDFZ, key="test", mode="w", complevel=9, complib="blosc"
    ),
    lambda x: pd.read_hdf(HDFZ, "test"),
    lambda x: df.to_feather(FTH),
    lambda x: pd.read_feather(FTH),
    lambda x: df.to_parquet(PQT),
    lambda x: pd.read_parquet(PQT),
]
# Time the write and the read of every format and record the size of the file
# the write produced.
runs_number = 1  # repetitions per measurement
runs_result = []  # rows of [label, write seconds, read seconds, size in MiB]

file_names = [XLSX, CSV, PKL, PKLZ, HDF, HDFZ, FTH, PQT]
# funcs holds (write, read) pairs back to back: even slots are writers, odd
# slots are readers, in the same order as file_names.
for name, writer, reader in zip(file_names, funcs[0::2], funcs[1::2]):
    func = name.replace("test.", "")  # label, e.g. "test.csv" -> "csv"
    # timeit.timeit returns the total time for all repetitions; divide so the
    # reported value stays "seconds per run" when runs_number is raised.
    w_speed = timeit.timeit(lambda: writer(df), number=runs_number) / runs_number
    r_speed = timeit.timeit(lambda: reader(df), number=runs_number) / runs_number
    filesize = os.path.getsize(name) / 1024 / 1024  # bytes -> MiB
    runs_result.append([func, w_speed, r_speed, filesize])
# Show the raw measurement rows, then the same data as a labelled table.
pprint(runs_result)

result_columns = ["func", "write [s]", "read [s]", "filesize [M]"]
df_rst = pd.DataFrame(runs_result, columns=result_columns)
pprint(df_rst)

# Bar chart with the format label on the x axis and the three measured
# quantities as the plotted series.
df_rst.plot(
    x=result_columns[0],
    y=result_columns[1:],
    kind="bar",
    title=f"datasize = {size}",
)