Pandas for time series data — tricks and tips

Time series data

df['day_time'] = pd.to_datetime(df['day_time'], format='%Y-%m-%d %H:%M:%S')
0 2012-10-12 00:00:00
1 2012-10-12 00:30:00
2 2012-10-12 01:00:00
3 2012-10-12 01:30:00
full_idx = pd.date_range(start=df['day_time'].min(), end=df['day_time'].max(), freq='30T')
df = (
    df
    .groupby('LCLid', as_index=False)
    .apply(lambda group: group.reindex(full_idx, method='nearest'))
    .reset_index(level=0, drop=True)
    .sort_index()
)
# Note date_range is inclusive of the end date
ref_date_range = pd.date_range('2012-2-5 00:00:00', '2014-2-8 23:30:00', freq='30Min')
ref_df = pd.DataFrame(np.random.randint(1, 20, (ref_date_range.shape[0], 1)))
ref_df.index = ref_date_range
# check for missing datetimeindex values based on reference index (with all values)
missing_dates = ref_df.index[~ref_df.index.isin(df.index)]
missing_dates
>>DatetimeIndex(['2013-09-09 23:00:00', '2013-09-09 23:30:00',
                 '2013-09-10 00:00:00', '2013-09-10 00:30:00'],
                dtype='datetime64[ns]', freq='30T')
# pd.datetime was removed from pandas; use pd.Timestamp instead
split_date = pd.Timestamp(2014, 2, 2)
df_train = df.loc[df['day_time'] < split_date]
df_test = df.loc[df['day_time'] >= split_date]
dt = pd.to_datetime("2016-04-23 11:00:00")
df.index.get_loc(dt, method="nearest")  # get index date
idx = df.index[df.index.get_loc(dt, method='nearest')]
#row to series
s = df.iloc[df.index.get_loc(dt, method='nearest')]
df['t_val'] = df.index
df['delta'] = (df['t_val'] - df['t_val'].shift()).fillna(0)
dt = pd.to_datetime(str(train_df['date'].iloc[0]))
dt
>>Timestamp('2016-01-10 00:00:00')
train_df['elapsed'] = pd.Series(delta.seconds for delta in (train_df['date'] - dt))
# convert seconds to hours
train_df['elapsed'] = train_df['elapsed'].apply(lambda x: x/3600)

Housekeeping

            data
day_time
2014-02-02 0.45
2014-02-02 0.41
df.reset_index(inplace=True)
  day_time   data
0 2014-02-02 0.45
0 2014-02-02 0.41
#to drop it
df.reset_index(drop=True, inplace=True)
df = df.set_index("day_time")
df = df.reset_index(drop=True)
df.drop(columns=['col_to_drop', 'other_col_to_drop'], inplace=True)
df=df.drop_duplicates(subset=['id'])
df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'}, inplace=True)
df.sort_values(by=['column_1', 'column_2'])#descending
df.sort_values(by='column_1', ascending=0)
ids_sort_by = ['34g', '56gf', '2w', '34nb']
df['id_cat'] = pd.Categorical(
    df['id'],
    categories=ids_sort_by,
    ordered=True
)
df=df.sort_values('id_cat')
df.loc[df['column_name'] == some_value]
df.loc[df['column_name'].isin(some_values)]
df.loc[(df['column_name'] == some_value) & df['other_column'].isin(some_values)]
df1 = df[['a','b']]
acorns = df.Acorn.unique()#same as
acorns = df['Acorn'].unique()
lowest_row = df.iloc[df['column_1'].argmin()]
my_series = df.iloc[0]
my_df = df.iloc[[0]]
df.iloc[:,0]
classes=df.idxmax(axis=1)
df_n = df.sample(frac=0.7)
df_n = df.sample(n=20)
df[~df['name'].str.contains("mouse")]
# eg current df['id'] consists of ['23_a', '23_b', '45_1', '45_2']
core_ids = ['23', '45']
df = df[df['id'].str.contains('|'.join(core_ids))]
dup_df=df_loss[df_loss.duplicated(['id','model'])]
dup_df=df_loss[df_loss.duplicated()]
df.loc[(df['island'] == 'zanzibar') & (df['vegetation'] == 'cleared'), ['biodiversity']]='low'

Create 10 fold Train/Test Splits

# tt_splits.py
import pandas as pd

DATA_PATH = some_path_to_data
train_df = pd.read_csv(DATA_PATH + 'train.csv')

def create_folds(df=None, n_folds=10, frac=0.9):
    """Create *n_folds* random train/validation splits of *df*.

    Each fold samples ``frac`` of the rows (without replacement) for
    training; the remaining rows form that fold's validation set.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Data to split.  Defaults to the module-level ``train_df``
        (backward compatible with the original zero-argument call).
    n_folds : int
        Number of folds to generate.
    frac : float
        Fraction of rows used for training in each fold.

    Returns
    -------
    tuple[list, list]
        Parallel lists ``(train_dfs, val_dfs)`` of DataFrames.
    """
    if df is None:
        df = train_df
    train_dfs = []
    val_dfs = []
    for _ in range(n_folds):
        train_n_df = df.sample(frac=frac).copy()
        # Build the validation set as the index complement of the sample.
        # The original ``df[~df.isin(train_n_df)].dropna()`` compared by
        # VALUE, so it silently dropped any row containing NaN and
        # mis-handled duplicated values; dropping by index is exact.
        val_n_df = df.drop(train_n_df.index)
        train_dfs.append(train_n_df)
        val_dfs.append(val_n_df)
    return train_dfs, val_dfs

def write_folds(train_dfs, val_dfs, data_path=None):
    """Write each (train, val) fold pair to CSV.

    Fold *i* is written to ``fold_{i}_train.csv`` and ``fold_{i}_test.csv``.

    Parameters
    ----------
    train_dfs, val_dfs : list[pandas.DataFrame]
        Parallel lists of folds, e.g. as returned by ``create_folds``.
    data_path : str, optional
        Output path prefix; defaults to the module-level ``DATA_PATH``
        (backward compatible with the original two-argument call).
    """
    if data_path is None:
        data_path = DATA_PATH
    # enumerate replaces the original manual i = 0 / i += 1 counter
    for i, (t, v) in enumerate(zip(train_dfs, val_dfs)):
        t.to_csv(data_path + f'fold_{i}_train.csv', index=False)
        v.to_csv(data_path + f'fold_{i}_test.csv', index=False)


if __name__ == '__main__':
    train_dfs, val_dfs = create_folds()
    write_folds(train_dfs, val_dfs)

Group by

#id   model_name  pred
#34g4 resnet50 car
#34g4 resnet50 bus
mode_df=temp_df.groupby(['id', 'model_name'])['pred'].agg(pd.Series.mode).to_frame()
df = df(['LCLid']).mean().reset_index()

Replacement

# for example first I created a new dataframe based on a selection
df_b = df_a.loc[df_a['machine_id'].isnull()]
# replace column with value from another column
for i in df_b.index:
    df_b.at[i, 'machine_id'] = df_b.at[i, 'box_id']
# now replace rows in original dataframe
df_a.loc[df_b.index] = df_b
df.loc[0:2,'col'] = 42
pred_df['id'] = pred_df['id'].str.replace('_raw', '')

Iterate over rows

for index, row in df.iterrows():
print (row["type"], row["value"])
for row in df.itertuples():
print (getattr(row, "type"), getattr(row, "value"))
def my_fn(c):
return c + 1

df['plus_one'] = df.apply(lambda row: my_fn(row['value']), axis=1)
for i in df.index:
if <something>:
df.at[i, 'ifor'] = x
else:
df.at[i, 'ifor'] = y

NaN’s

df.fillna(0)
df['some_column'].fillna(0, inplace=True)
df['energy(kWh/hh)'].isna().sum()
#which cols have nan
df.isna().any()
#list of cols with nan
df.columns[df.isna().any()].tolist()
#select cols with nan
df.loc[:, df.isna().any()]
df[df['Col2'].isnull()]

Data Analysis

df.tail(n=2)
df.head().T.head(len(list(df)))>>             0  1  2  3  4
index 2012-02-05 00:00:00 2012-02-05 00:00:00 2012-02-05 00:00:00 2012-02-05 00:00:00 2012-02-05 00:00:00
LCLid MAC000006 MAC005178 MAC000066 MAC004510 MAC004882
energy(kWh/hh) 0.042 0.561 0.037 0.254 0.426
dayYear 2012 2012 2012 2012 2012
dayMonth 2 2 2 2 2
dayWeek 5 5 5 5 5
dayDay 5 5 5 5 5
dayDayofweek 6 6 6 6 6
dayDayofyear 36 36 36 36 36
df_concat = pd.concat((df_1, df_2, df_3, df_4))
by_row_index = df_concat.groupby(df_concat.index)
df_means = by_row_index.mean()
df_means['Sum'] = df_means.sum(axis=1)

String operations


# regex=False: '?' is a regex metacharacter, so a literal replace is needed
df['bankHoliday'] = df['bankHoliday'].str.replace('?', '', regex=False)
df['concat'] = df["id"].astype(str) + '-' + df["name"]

Merge

df = pd.merge(X, y, on=['city', 'year', 'weekofyear'])

Concat / Append vertically

# DataFrame.append was removed in pandas 2.0; use pd.concat
df = pd.concat([df1, df2], ignore_index=True)
# or
frames = [df1, df2, df3]

result = pd.concat(frames)

Split

idxs = df.index.values
chunked = np.array_split(idxs, NUM_CORES)
for chunk in chunked:
    part_df = df.loc[df.index.isin(chunk)]
    # run some process on the part
    p = Process(target=proc_chunk, args=[part_df])
    jobs.append(p)
    p.start()
df[['model_name','run']] = df.model.str.rsplit(pat="-", n=1, expand = True)

Type conversion

df_test[['value']] = df_test[['value']].astype(int)

Add data

df["nan_column"] = np.nan
df["zero_column"] = 0

Data types

df[['a', 'b']] = df[['a', 'b']].apply(pd.to_numeric, errors='coerce')

Creating DataFrames

df = pd.DataFrame([sig_dict, id_dict, phase_dict, target_dict])
df = df.T
df.columns = ['signal', 'id', 'phase', 'target']
missing=['dog','cat','frog']
df=pd.DataFrame({"missing":missing})
df=pd.DataFrame(list(zip(mylist1, mylist2, mylist3)),
columns=['title1','title2', 'title3'])

Numpy

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7,8]])
a, b
>(array([[1, 2],
[3, 4]]), array([[5, 6],
[7, 8]]))
c=np.concatenate((a, b), axis=1)
c
>array([[1, 2, 5, 6],
[3, 4, 7, 8]])
df = pd.DataFrame(c)
df.head()
>0 1 2 3
0 1 2 5 6
1 3 4 7 8
for i in range(10):
    df = pq.read_table(path + f'df_{i}.parquet').to_pandas()
    vals = df.values
    if i > 0:
        # axis=1 to concat horizontally
        np_vals = np.concatenate((np_vals, vals), axis=1)
    else:
        np_vals = vals
np.savetxt(path + 'df_np.csv', np_vals, delimiter=",")

Import/Export

# "{1}" had no positional argument 1; "{}" formats the single argument
f = lambda x: x.to_csv("{}.csv".format(x.name.lower()), index=False)
df.groupby('LCLid').apply(f)
# for example our original dataframe may be:
     day_time            LCLid     energy(kWh/hh)
289 2012–02–05 00:00:00 MAC004954 0.45
289 2012–02–05 00:30:00 MAC004954 0.46
6100 2012–02–05 05:30:00 MAC000041 0.23
df.to_feather('df_data.feather')
import feather as ftr
df = ftr.read_dataframe('df_data.feather')
import pyarrow.parquet as pq
df.to_parquet("data.parquet")
df = pq.read_table("data.parquet").to_pandas()
df.to_csv('file.csv', index=False)
df = pd.read_csv('signals.csv', names=['phase', 'amplitude'])

datetime64 Date and Time Codes

Code Meaning 
Y year
M month
W week
D day
Code Meaning
h hour
m minute
s second
ms millisecond
us microsecond
ns nanosecond
ps picosecond
fs femtosecond
as attosecond

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store