There are some Pandas DataFrame manipulations that I keep looking up how to do. I am recording these here to save myself time. These may help you too.
Time series data
Convert column to datetime with given format
import pandas as pd

df['day_time'] = pd.to_datetime(df['day_time'], format='%Y-%m-%d %H:%M:%S')

0   2012-10-12 00:00:00
1   2012-10-12 00:30:00
2   2012-10-12 01:00:00
3   2012-10-12 01:30:00
Re-index a dataframe to interpolate missing values (e.g. every 30 mins below). You need a datetime index on the df before running this.
full_idx = pd.date_range(start=df['day_time'].min(), end=df['day_time'].max(), freq='30T')

df = (
    df
    .groupby('LCLid', as_index=False)
    .apply(lambda group: group.reindex(full_idx, method='nearest'))
    .reset_index(level=0, drop=True)
    .sort_index()
)
Find missing dates in a DataFrame
import numpy as np

# Note date_range is inclusive of the end date
ref_date_range = pd.date_range('2012-2-5 00:00:00', '2014-2-8 23:30:00', freq='30Min')
ref_df = pd.DataFrame(np.random.randint(1, 20, (ref_date_range.shape[0], 1)))
ref_df.index = ref_date_range

# check for missing datetimeindex values based on reference index (with all values)
missing_dates = ref_df.index[~ref_df.index.isin(df.index)]
missing_dates
>> DatetimeIndex(['2013-09-09 23:00:00', '2013-09-09 23:30:00',
                  '2013-09-10 00:00:00', '2013-09-10 00:30:00'],
                 dtype='datetime64[ns]', freq='30T')
Split a dataframe based on a date in a datetime column
split_date = pd.Timestamp(2014, 2, 2)  # pd.datetime is deprecated; use pd.Timestamp
df_train = df.loc[df['day_time'] < split_date]
df_test = df.loc[df['day_time'] >= split_date]
Find nearest date in dataframe (here we assume index is a datetime field)
dt = pd.to_datetime("2016-04-23 11:00:00")
df.index.get_loc(dt, method="nearest")

# get index date
idx = df.index[df.index.get_loc(dt, method='nearest')]

# row to series
s = df.iloc[df.index.get_loc(dt, method='nearest')]
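Note that in pandas 2.x, get_loc no longer accepts a method argument; get_indexer is the replacement. The same lookup then looks like this:

# pandas >= 2.0: use get_indexer instead of get_loc(..., method=...)
pos = df.index.get_indexer([dt], method='nearest')[0]
idx = df.index[pos]  # nearest index date
s = df.iloc[pos]     # nearest row as a series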
Calculate a delta between datetimes in rows (assuming index is datetime)
df['t_val'] = df.index
# fill the first row's NaT with a zero timedelta
df['delta'] = (df['t_val'] - df['t_val'].shift()).fillna(pd.Timedelta(0))
Calculate a running delta between a date column and a given date (here we use the first date in the date column as the date to difference against).
dt = pd.to_datetime(str(train_df['date'].iloc[0]))
dt
>> Timestamp('2016-01-10 00:00:00')

# use total_seconds(): plain .seconds ignores any whole-day component
train_df['elapsed'] = pd.Series(delta.total_seconds() for delta in (train_df['date'] - dt))

# convert seconds to hours
train_df['elapsed'] = train_df['elapsed'].apply(lambda x: x / 3600)
Housekeeping
Reset index
            data
day_time
2014-02-02  0.45
2014-02-02  0.41

df.reset_index(inplace=True)

     day_time  data
0  2014-02-02  0.45
1  2014-02-02  0.41

# to discard the old index instead of keeping it as a column
df.reset_index(drop=True, inplace=True)
Set index
df = df.set_index("day_time")
Reset index, don’t keep the original index
df = df.reset_index(drop=True)
Drop column(s)
df.drop(columns=['col_to_drop', 'other_col_to_drop'], inplace=True)
Drop rows that contain a duplicate value in a specific column(s)
df=df.drop_duplicates(subset=['id'])
Rename column(s)
df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'}, inplace=True)
Sort dataframe by column_1 then column_2, in ascending order
df.sort_values(by=['column_1', 'column_2'])

# descending
df.sort_values(by='column_1', ascending=False)
Sort dataframe using a custom list (sorts by column 'id' using the given list order of ids)
ids_sort_by = ['34g', '56gf', '2w', '34nb']  # the ids must be strings

df['id_cat'] = pd.Categorical(
    df['id'],
    categories=ids_sort_by,
    ordered=True
)
df = df.sort_values('id_cat')
Selection
Select rows from a DataFrame based on values in a column in pandas
Super useful snippets after https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas
df.loc[df['column_name'] == some_value]

df.loc[df['column_name'].isin(some_values)]

df.loc[(df['column_name'] == some_value) & df['other_column'].isin(some_values)]
Select columns from dataframe
df1 = df[['a','b']]
Get unique values in a column
acorns = df.Acorn.unique()

# same as
acorns = df['Acorn'].unique()
Get row where value in column is a minimum
lowest_row = df.iloc[df['column_1'].argmin()]
Select by row number
my_series = df.iloc[0]
my_df = df.iloc[[0]]
Select by column number
df.iloc[:,0]
Get column names for maximum value in each row
classes=df.idxmax(axis=1)
Select 70% of Dataframe rows
df_n = df.sample(frac=0.7)
Randomly select n rows from a Dataframe
df_n = df.sample(n=20)
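Sampling is random on each run; if you need a repeatable selection, sample also takes a random_state seed:

df_n = df.sample(n=20, random_state=42)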
Select rows where a column doesn't contain a substring (remove the tilde for does contain)
df[~df['name'].str.contains("mouse")]
Select rows containing a substring from a list of substrings
#eg current df['id'] consists of ['23_a', '23_b','45_1','45_2']
core_ds=['23','45']
df=df[df['id'].str.contains('|'.join(core_ids))]
Select duplicated rows based on selected columns
dup_df=df_loss[df_loss.duplicated(['id','model'])]
Select duplicated rows based on all columns (returns all except first occurrence)
dup_df=df_loss[df_loss.duplicated()]
Select using a query then set a value for a specific column. In the example below we search the dataframe on the 'island' and 'vegetation' columns, and for the matches we set the 'biodiversity' column to 'low'.
df.loc[(df['island'] == 'zanzibar') & (df['vegetation'] == 'cleared'), ['biodiversity']]='low'
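If you only need the matching rows (without the assignment), the same selection can also be written with DataFrame.query:

subset = df.query("island == 'zanzibar' and vegetation == 'cleared'")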
Create 10 fold Train/Test Splits
Similar to selecting a % of dataframe rows, we can repeat the random selection to create 10 fold train/test splits using a 90/10 train/test split ratio.
# tt_splits.py
import pandas as pd

DATA_PATH = some_path_to_data

train_df = pd.read_csv(DATA_PATH + 'train.csv')

def create_folds():
    train_dfs = []
    val_dfs = []
    for n in range(10):
        train_n_df = train_df.sample(frac=0.9).copy()
        # rows not sampled into the train split form the validation split
        # (assumes the original data contains no NaNs)
        val_n_df = train_df[~train_df.isin(train_n_df)].dropna()
        train_dfs.append(train_n_df)
        val_dfs.append(val_n_df)
    return train_dfs, val_dfs

def write_folds(train_dfs, val_dfs):
    i = 0
    for t, v in zip(train_dfs, val_dfs):
        t.to_csv(DATA_PATH + f'fold_{i}_train.csv', index=False)
        v.to_csv(DATA_PATH + f'fold_{i}_test.csv', index=False)
        i += 1

if __name__ == '__main__':
    train_dfs, val_dfs = create_folds()
    write_folds(train_dfs, val_dfs)
Group by
Group by columns, get most common occurrence of string in other column (eg class predictions on different runs of a model).
# id    model_name  pred
# 34g4  resnet50    car
# 34g4  resnet50    bus

mode_df = temp_df.groupby(['id', 'model_name'])['pred'].agg(pd.Series.mode).to_frame()
Group by column, apply operation then convert result to dataframe
df = df(['LCLid']).mean().reset_index()
Replacement
Replace rows in dataframe with rows from another dataframe with same index.
# for example, first I created a new dataframe based on a selection
df_b = df_a.loc[df_a['machine_id'].isnull()]

# replace column with value from another column
for i in df_b.index:
    df_b.at[i, 'machine_id'] = df_b.at[i, 'box_id']

# now replace rows in original dataframe
df_a.loc[df_b.index] = df_b
Replace value in column(s) by row index (note .loc slicing is inclusive of both endpoints, so 0:2 sets rows 0, 1 and 2)
df.loc[0:2,'col'] = 42
Replace substring in column. See pandas docs for regex use
pred_df['id'] = pred_df['id'].str.replace('_raw', '')
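For pattern-based replacement, pass a regular expression with regex=True, eg to strip '_raw' only when it appears at the end of the string:

pred_df['id'] = pred_df['id'].str.replace(r'_raw$', '', regex=True)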
Iterate over rows
Using iterrows
for index, row in df.iterrows():
    print(row["type"], row["value"])
Using itertuples (faster, see https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas)
for row in df.itertuples():
    print(getattr(row, "type"), getattr(row, "value"))
If you need to modify the rows you are iterating, use apply:
def my_fn(c):
    return c + 1

df['plus_one'] = df.apply(lambda row: my_fn(row['value']), axis=1)
Or alternatively, set values by index with df.at, as in the example below:
for i in df.index:
    if <something>:
        df.at[i, 'ifor'] = x
    else:
        df.at[i, 'ifor'] = y
NaNs
Replace NaN in df or column with zeros (or value)
df.fillna(0)

df['some_column'].fillna(0, inplace=True)
Count NaN’s in column
df['energy(kWh/hh)'].isna().sum()
Find which columns have NaNs, list those columns, and select the columns with one or more NaNs. After https://stackoverflow.com/questions/36226083/how-to-find-which-columns-contain-any-nan-value-in-pandas-dataframe-python
#which cols have nan
df.isna().any()
#list of cols with nan
df.columns[df.isna().any()].tolist()
#select cols with nan
df.loc[:, df.isna().any()]
Get rows where column is NaN
df[df['Col2'].isnull()]
Data Analysis
Show last n rows of dataframe
df.tail(n=2)
Show the transpose of the dataframe head. We pass len(list(df)) to head to show all the columns.
df.head().T.head(len(list(df)))

>>                    0                    1                    2                    3                    4
index 2012-02-05 00:00:00 2012-02-05 00:00:00 2012-02-05 00:00:00 2012-02-05 00:00:00 2012-02-05 00:00:00
LCLid MAC000006 MAC005178 MAC000066 MAC004510 MAC004882
energy(kWh/hh) 0.042 0.561 0.037 0.254 0.426
dayYear 2012 2012 2012 2012 2012
dayMonth 2 2 2 2 2
dayWeek 5 5 5 5 5
dayDay 5 5 5 5 5
dayDayofweek 6 6 6 6 6
dayDayofyear 36 36 36 36 36
Calculate the mean for each cell across multiple dataframes
df_concat = pd.concat((df_1, df_2, df_3, df_4))
by_row_index = df_concat.groupby(df_concat.index)
df_means = by_row_index.mean()
Calculate sum across all columns for each row
df_means['Sum'] = df_means.sum(axis=1)
String operations
Replace a specific character in column
# regex=False treats '?' as a literal character, not a regex token
df['bankHoliday'] = df['bankHoliday'].str.replace('?', '', regex=False)
Concatenate two columns
df['concat'] = df["id"].astype(str) + '-' + df["name"]
Merge
Merge DataFrame on multiple columns
df = pd.merge(X, y, on=['city', 'year', 'weekofyear'])
Concat / Append vertically
(see https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html)
df = df1.append(df2, ignore_index=True)  # note: append was removed in pandas 2.0

# or
frames = [df1, df2, df3]
result = pd.concat(frames)
Split
Split dataframe into N roughly equal sized dataframes
from multiprocessing import Process

idxs = df.index.values
chunked = np.array_split(idxs, NUM_CORES)

jobs = []
for chunk in chunked:
    part_df = df.loc[df.index.isin(chunk)]
    # run some process on the part
    p = Process(target=proc_chunk, args=[part_df])
    jobs.append(p)
    p.start()
Split a column string by the last occurrence of a substring, create new columns
df[['model_name','run']] = df.model.str.rsplit(pat="-", n=1, expand=True)
Type conversion
Change column type in dataframe
df_test[['value']] = df_test[['value']].astype(int)
Add data
Add an empty column
df["nan_column"] = np.nandf["zero_column"] = 0
Data types
Convert columns 'a' and 'b' to numeric, coercing non-numeric values to NaN
df[['a', 'b']] = df[['a', 'b']].apply(pd.to_numeric, errors='coerce')
Creating DataFrames
From list of dicts
df = pd.DataFrame([sig_dict, id_dict, phase_dict, target_dict])
df = df.T
df.columns = ['signal', 'id', 'phase', 'target']
From list
missing=['dog','cat','frog']
df=pd.DataFrame({"missing":missing})
From multiple lists
df=pd.DataFrame(list(zip(mylist1, mylist2, mylist3)),
columns=['title1','title2', 'title3'])
Numpy
As an alternative to concatenating dataframes, you can use numpy (less memory intensive than pandas, useful for large merges).
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7,8]])
a, b

> (array([[1, 2],
          [3, 4]]),
   array([[5, 6],
          [7, 8]]))

c = np.concatenate((a, b), axis=1)

c
> array([[1, 2, 5, 6],
         [3, 4, 7, 8]])

df = pd.DataFrame(c)
df.head()

>   0  1  2  3
0   1  2  5  6
1   3  4  7  8

For example, reading a set of parquet files and concatenating their values horizontally:

import pyarrow.parquet as pq

for i in range(10):
    df = pq.read_table(path + f'df_{i}.parquet').to_pandas()
    vals = df.values
    if i > 0:
        # axis=1 to concat horizontally
        np_vals = np.concatenate((np_vals, vals), axis=1)
    else:
        np_vals = vals

np.savetxt(path + f'df_np.csv', np_vals, delimiter=",")
Import/Export
Group by a column, then export each group into a separate dataframe
f = lambda x: x.to_csv("{0}.csv".format(x.name.lower()), index=False)
df.groupby('LCLid').apply(f)

# for example our original dataframe may be:
#       day_time             LCLid      energy(kWh/hh)
# 289   2012-02-05 00:00:00  MAC004954  0.45
# 289   2012-02-05 00:30:00  MAC004954  0.46
# 6100  2012-02-05 05:30:00  MAC000041  0.23
Import / Export in Feather format
Here we save a DataFrame in feather format (really fast to read back in). Note I have an issue saving feather files >~2GB using pandas==0.23.4
df.to_feather('df_data.feather')

import feather as ftr
df = ftr.read_dataframe('df_data.feather')
Import / Export in Parquet format
import pyarrow.parquet as pq

df.to_parquet("data.parquet")
df = pq.read_table("data.parquet").to_pandas()
Save without index
df.to_csv('file.csv', index=False)
Read in, specifying new column names
df = pd.read_csv('signals.csv', names=['phase', 'amplitude'])
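If the file already has its own header row, also pass header=0 so the old header is replaced rather than read in as a data row:

df = pd.read_csv('signals.csv', names=['phase', 'amplitude'], header=0)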
datetime64 Date and Time Codes
from here: https://docs.scipy.org/doc/numpy/reference/arrays.datetime.html
Date units:
Code Meaning
Y year
M month
W week
D day
Time units:
Code Meaning
h hour
m minute
s second
ms millisecond
us microsecond
ns nanosecond
ps picosecond
fs femtosecond
as attosecond
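A quick sketch of these codes in use with numpy, casting a timestamp to coarser units:

import numpy as np

t = np.datetime64('2012-02-05T00:30:00')  # second resolution

t.astype('datetime64[D]')  # 'D' code truncates to the day
>> numpy.datetime64('2012-02-05')

np.datetime64('2012-02-05', 'M')  # 'M' code: month resolution
>> numpy.datetime64('2012-02')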