""" self-contained to write legacy storage pickle files To use this script. Create an environment where you want generate pickles, say its for 0.20.3, with your pandas clone in ~/pandas . activate pandas_0.20.3 cd ~/ $ python pandas/pandas/tests/io/generate_legacy_storage_files.py \ pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ pickle This script generates a storage file for the current arch, system, and python version pandas version: 0.20.3 output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ storage format: pickle created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle The idea here is you are using the *current* version of the generate_legacy_storage_files with an *older* version of pandas to generate a pickle file. We will then check this file into a current branch, and test using test_pickle.py. This will load the *older* pickles and test versus the current data that is generated (with master). These are then compared. If we have cases where we changed the signature (e.g. we renamed offset -> freq in Timestamp). Then we have to conditionally execute in the generate_legacy_storage_files.py to make it run under the older AND the newer version. """ from datetime import timedelta from distutils.version import LooseVersion import os import pickle import platform as pl import sys import numpy as np import pandas from pandas import ( Categorical, DataFrame, Index, MultiIndex, NaT, Period, RangeIndex, Series, Timestamp, bdate_range, date_range, period_range, timedelta_range, ) from pandas.tseries.offsets import ( FY5253, BusinessDay, BusinessHour, CustomBusinessDay, DateOffset, Day, Easter, Hour, LastWeekOfMonth, Minute, MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, SemiMonthBegin, SemiMonthEnd, Week, WeekOfMonth, YearBegin, YearEnd, ) try: # TODO: remove try/except when 0.24.0 is the legacy version. from pandas.arrays import SparseArray except ImportError: from pandas.core.sparse.api import SparseArray _loose_version = LooseVersion(pandas.__version__) def _create_sp_series(): nan = np.nan # nan-based arr = np.arange(15, dtype=np.float64) arr[7:12] = nan arr[-1:] = nan bseries = Series(SparseArray(arr, kind="block")) bseries.name = "bseries" return bseries def _create_sp_tsseries(): nan = np.nan # nan-based arr = np.arange(15, dtype=np.float64) arr[7:12] = nan arr[-1:] = nan date_index = bdate_range("1/1/2011", periods=len(arr)) bseries = Series(SparseArray(arr, kind="block"), index=date_index) bseries.name = "btsseries" return bseries def _create_sp_frame(): nan = np.nan data = { "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], "C": np.arange(10).astype(np.int64), "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], } dates = bdate_range("1/1/2011", periods=10) return DataFrame(data, index=dates).apply(SparseArray) def create_data(): """ create the pickle data """ data = { "A": [0.0, 1.0, 2.0, 3.0, np.nan], "B": [0, 1, 0, 1, 0], "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], "D": date_range("1/1/2009", periods=5), "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], } scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M")) index = dict( int=Index(np.arange(10)), date=date_range("20130101", periods=10), period=period_range("2013-01-01", freq="M", periods=10), float=Index(np.arange(10, dtype=np.float64)), uint=Index(np.arange(10, dtype=np.uint64)), timedelta=timedelta_range("00:00:00", freq="30T", periods=10), ) index["range"] = RangeIndex(10) if _loose_version >= LooseVersion("0.21"): from pandas import interval_range index["interval"] = interval_range(0, periods=10) mi = dict( reg2=MultiIndex.from_tuples( tuple( zip( *[ ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], ["one", "two", "one", "two", "one", "two", "one", "two"], ] ) ), names=["first", "second"], ) ) series = dict( float=Series(data["A"]), int=Series(data["B"]), mixed=Series(data["E"]), ts=Series( np.arange(10).astype(np.int64), index=date_range("20130101", periods=10) ), mi=Series( np.arange(5).astype(np.float64), index=MultiIndex.from_tuples( tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"] ), ), dup=Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]), cat=Series(Categorical(["foo", "bar", "baz"])), dt=Series(date_range("20130101", periods=5)), dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")), period=Series([Period("2000Q1")] * 5), ) mixed_dup_df = DataFrame(data) mixed_dup_df.columns = list("ABCDA") frame = dict( float=DataFrame({"A": series["float"], "B": series["float"] + 1}), int=DataFrame({"A": series["int"], "B": series["int"] + 1}), mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}), mi=DataFrame( {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)}, index=MultiIndex.from_tuples( tuple( zip( *[ ["bar", "bar", "baz", "baz", "baz"], ["one", "two", "one", "two", "three"], ] ) ), names=["first", "second"], ), ), dup=DataFrame( np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"] ), cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}), cat_and_float=DataFrame( { "A": Categorical(["foo", "bar", "baz"]), "B": np.arange(3).astype(np.int64), } ), mixed_dup=mixed_dup_df, dt_mixed_tzs=DataFrame( { "A": Timestamp("20130102", tz="US/Eastern"), "B": Timestamp("20130603", tz="CET"), }, index=range(5), ), dt_mixed2_tzs=DataFrame( { "A": Timestamp("20130102", tz="US/Eastern"), "B": Timestamp("20130603", tz="CET"), "C": Timestamp("20130603", tz="UTC"), }, index=range(5), ), ) cat = dict( int8=Categorical(list("abcdefg")), int16=Categorical(np.arange(1000)), int32=Categorical(np.arange(10000)), ) timestamp = dict( normal=Timestamp("2011-01-01"), nat=NaT, tz=Timestamp("2011-01-01", tz="US/Eastern"), ) timestamp["freq"] = Timestamp("2011-01-01", freq="D") timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M") off = { "DateOffset": DateOffset(years=1), "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824), "BusinessDay": BusinessDay(offset=timedelta(seconds=9)), "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"), "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"), "SemiMonthBegin": SemiMonthBegin(day_of_month=9), "SemiMonthEnd": SemiMonthEnd(day_of_month=24), "MonthBegin": MonthBegin(1), "MonthEnd": MonthEnd(1), "QuarterBegin": QuarterBegin(1), "QuarterEnd": QuarterEnd(1), "Day": Day(1), "YearBegin": YearBegin(1), "YearEnd": YearEnd(1), "Week": Week(1), "Week_Tues": Week(2, normalize=False, weekday=1), "WeekOfMonth": WeekOfMonth(week=3, weekday=4), "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3), "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"), "Easter": Easter(), "Hour": Hour(1), "Minute": Minute(1), } return dict( series=series, frame=frame, index=index, scalars=scalars, mi=mi, sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), sp_frame=dict(float=_create_sp_frame()), cat=cat, timestamp=timestamp, offsets=off, ) def create_pickle_data(): data = create_data() return data def platform_name(): return "_".join( [ str(pandas.__version__), str(pl.machine()), str(pl.system().lower()), str(pl.python_version()), ] ) def write_legacy_pickles(output_dir): version = pandas.__version__ print( "This script generates a storage file for the current arch, system, " "and python version" ) print(f" pandas version: {version}") print(f" output dir : {output_dir}") print(" storage format: pickle") pth = f"{platform_name()}.pickle" fh = open(os.path.join(output_dir, pth), "wb") pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL) fh.close() print(f"created pickle file: {pth}") def write_legacy_file(): # force our cwd to be the first searched sys.path.insert(0, ".") if not (3 <= len(sys.argv) <= 4): exit( "Specify output directory and storage type: generate_legacy_" "storage_files.py " ) output_dir = str(sys.argv[1]) storage_type = str(sys.argv[2]) if storage_type == "pickle": write_legacy_pickles(output_dir=output_dir) else: exit("storage_type must be one of {'pickle'}") if __name__ == "__main__": write_legacy_file()