
Some Techniques to fill NaNs
Methods such as forward fill, backward fill, cumulative & reverse cumulative sum were used in this post.
"""
I created a sample data with four columns - group, time_count, count,
accumulated_count. The accumulated_count column contains the NaNs. I tried to show various
ways by which NaNs can be filled, namely; forward fill, backward fill and a complex way which
compares the difference between current and previous; and current and next values of
another row (time_count) to decide how to fill the NaN. It backward fills when the later is
greater and vice-versa. Various methods such as cumulative
sum and reverse cumulative sum were deployed to achieve these. You are free to modify to
your use case.
"""
# import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
# create sample data frame
data = pd.DataFrame({'group': [1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3],
'time_count': [10,13,16,31,33,35,39,41,61,65,22,23,32,35,52,56,58,62,64,65,13,23,26,29,43,48,50,53,63,73],
'count': [0,1,2,1,0,3,2,3,1,0,5,2,1,0,3,2,3,1,0,3,2,3,1,0,5,2,1,0,3,2],
'accumulated_count': [np.nan,np.nan,9,np.nan,np.nan,np.nan,16,np.nan,np.nan,28,5,np.nan,np.nan,19,np.nan,\
np.nan,np.nan,50,np.nan,np.nan,np.nan,np.nan,35,np.nan,np.nan,np.nan,np.nan,np.nan,\
np.nan,50]})
# view data
# data
# label rows with nan as False and notnan as True
data['nan_or_val'] = data['accumulated_count'].map(lambda x: x.is_integer())
# Logic
df_all = [] # list to store data for each group
for group, df in data.groupby('group'): # loop through each group
df['backward_fill'] = df.accumulated_count.fillna(method='bfill') # backward fill approach
df['forward_fill'] = df.accumulated_count.fillna(method='ffill') # forward fill approach
# conditional fill nan where the value of current nan is determined by difference between current and previous
# and current and next values of another column
df['curr_prev_time_count_diff'] = abs(df['time_count'].diff()).fillna(method='bfill') # difference between current and previous row
df['curr_next_time_count_diff'] = abs(df['time_count'].diff(-1)) # difference between current and next row
df['prev_or_next'] = df['curr_prev_time_count_diff'] < df['curr_next_time_count_diff'] # boolean to tell which is lesser
df['backward_cumsum'] = df[::-1].groupby(df['nan_or_val'].eq(True).cumsum())['count'].cumsum() # backward or reverse cumulative sum
df['forward_cumsum'] = df.groupby(df['nan_or_val'].eq(True).shift().cumsum())['count'].cumsum() # forward cumulative sum
# logic to fill nans by combining previous calculated columns
df['calc_accumulated_count'] = df.apply(lambda x:
(x['forward_fill'] + x['forward_cumsum'])
if ((pd.isna(x['backward_fill'])) & (x['nan_or_val'] == False) &\
(x['prev_or_next'] == False))
else (x['backward_fill'] - x['backward_cumsum'])
if ((x['nan_or_val'] == False) & (x['prev_or_next'] == False))
else (x['backward_fill'] - x['backward_cumsum'])
if (((x['nan_or_val'] == False) & (x['prev_or_next'] == True)) &\
(x['curr_prev_time_count_diff'] <= x['curr_next_time_count_diff']) &\
(x['backward_cumsum'] < x['forward_cumsum']))
else (x['forward_fill'] + x['forward_cumsum'])
if (((x['nan_or_val'] == False) & (x['prev_or_next'] == True)) &\
(x['curr_prev_time_count_diff'] <= x['curr_next_time_count_diff']) &\
(x['backward_cumsum'] > x['forward_cumsum']))
else x['accumulated_count'], axis=1)
# append each output to df_all list
df_all +=[df]
# concatenate each group data
new_data = pd.concat(df_all, axis=0)
# drop columns not needed
new_data.drop(columns=['nan_or_val', 'curr_prev_time_count_diff', 'prev_or_next', 'curr_next_time_count_diff'], inplace=True)
# view output
new_data