Some Techniques to fill NaNs

Some Techniques to fill NaNs

Methods such as forward fill, backward fill, cumulative & reverse cumulative sum were used in this post.
"""
I created a sample data with four columns - group, time_count, count, 
accumulated_count. The accumulated_count column contains the NaNs. I tried to show various
ways by which NaNs can be filled, namely; forward fill, backward fill and a complex way which 
compares the difference between current and previous; and current and next values of 
another row (time_count) to decide how to fill the NaN. It backward fills when the later is 
greater and vice-versa. Various methods such as cumulative 
sum and reverse cumulative sum were deployed to achieve these. You are free to modify to 
your use case.
"""
# import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# create sample data frame
data = pd.DataFrame({'group': [1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3],
                    'time_count': [10,13,16,31,33,35,39,41,61,65,22,23,32,35,52,56,58,62,64,65,13,23,26,29,43,48,50,53,63,73],
                    'count': [0,1,2,1,0,3,2,3,1,0,5,2,1,0,3,2,3,1,0,3,2,3,1,0,5,2,1,0,3,2],
                    'accumulated_count': [np.nan,np.nan,9,np.nan,np.nan,np.nan,16,np.nan,np.nan,28,5,np.nan,np.nan,19,np.nan,\
                                          np.nan,np.nan,50,np.nan,np.nan,np.nan,np.nan,35,np.nan,np.nan,np.nan,np.nan,np.nan,\
                                          np.nan,50]})
# view data
# data

# label rows with nan as False and notnan as True
data['nan_or_val'] = data['accumulated_count'].map(lambda x: x.is_integer())

# Logic
df_all = [] # list to store data for each group
for group, df in data.groupby('group'): # loop through each group
    df['backward_fill'] = df.accumulated_count.fillna(method='bfill') # backward fill approach
    df['forward_fill'] = df.accumulated_count.fillna(method='ffill') # forward fill approach
    
    # conditional fill nan where the value of current nan is determined by difference between current and previous 
    # and current and next values of another column
    df['curr_prev_time_count_diff'] = abs(df['time_count'].diff()).fillna(method='bfill') # difference between current and previous row
    df['curr_next_time_count_diff'] = abs(df['time_count'].diff(-1)) # difference between current and next row
    df['prev_or_next'] = df['curr_prev_time_count_diff'] < df['curr_next_time_count_diff'] # boolean to tell which is lesser
    df['backward_cumsum'] = df[::-1].groupby(df['nan_or_val'].eq(True).cumsum())['count'].cumsum() # backward or reverse cumulative sum
    df['forward_cumsum'] = df.groupby(df['nan_or_val'].eq(True).shift().cumsum())['count'].cumsum() # forward cumulative sum
    
    # logic to fill nans by combining previous calculated columns
    df['calc_accumulated_count'] = df.apply(lambda x:
                                            (x['forward_fill'] + x['forward_cumsum'])
                                            if ((pd.isna(x['backward_fill'])) & (x['nan_or_val'] == False) &\
                                                (x['prev_or_next'] == False))
                                            else (x['backward_fill'] - x['backward_cumsum'])
                                            if ((x['nan_or_val'] == False) & (x['prev_or_next'] == False))
                                            else (x['backward_fill'] - x['backward_cumsum'])
                                            if (((x['nan_or_val'] == False) & (x['prev_or_next'] == True)) &\
                                                (x['curr_prev_time_count_diff'] <= x['curr_next_time_count_diff']) &\
                                                (x['backward_cumsum'] < x['forward_cumsum']))
                                            else (x['forward_fill'] + x['forward_cumsum'])
                                            if (((x['nan_or_val'] == False) & (x['prev_or_next'] == True)) &\
                                                (x['curr_prev_time_count_diff'] <= x['curr_next_time_count_diff']) &\
                                                (x['backward_cumsum'] > x['forward_cumsum']))
                                            else x['accumulated_count'], axis=1)
    # append each output to df_all list
    df_all +=[df] 

# concatenate each group data
new_data = pd.concat(df_all, axis=0) 

# drop columns not needed
new_data.drop(columns=['nan_or_val', 'curr_prev_time_count_diff', 'prev_or_next', 'curr_next_time_count_diff'], inplace=True)

# view output
new_data
1