Handling NULL Values in DataFrames

 

NULL values are inevitable in real-world datasets, and handling them properly is crucial for data quality and analytical accuracy. Knowing when to drop, fill, or otherwise handle NULLs strategically can make the difference between accurate insights and misleading conclusions. This final practical topic teaches comprehensive strategies for managing missing data in production systems.

 

Understanding NULLs

What are NULLs?

NULLs represent missing, unknown, or undefined values. They're different from empty strings, zero, or false.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, isnull, count, when

spark = SparkSession.builder.appName("NullHandling").getOrCreate()

 

# Create sample data with NULLs
data = [
    (1, "Alice", 28, 75000, "Engineering"),
    (2, "Bob", None, 85000, "Sales"),
    (3, None, 25, 65000, "Marketing"),
    (4, "Diana", 35, None, "Engineering"),
    (5, "Eve", 29, 80000, None),
]

df = spark.createDataFrame(data, ["id", "name", "age", "salary", "department"])
df.show()

 

# Output (missing values are shown as null):
# +---+-----+----+------+-----------+
# | id| name| age|salary| department|
# +---+-----+----+------+-----------+
# |  1|Alice|  28| 75000|Engineering|
# |  2|  Bob|null| 85000|      Sales|
# |  3| null|  25| 65000|  Marketing|
# |  4|Diana|  35|  null|Engineering|
# |  5|  Eve|  29| 80000|       null|
# +---+-----+----+------+-----------+
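
NULL is not the same as an empty string or zero: equality checks against "" or 0 will not match missing values. A minimal sketch to illustrate the difference (the small demo DataFrame is made up just for this example):

demo = spark.createDataFrame([("", 0), (None, None)], ["text", "number"])

demo.filter(col("text") == "").count()      # 1 -- matches only the empty string
demo.filter(col("text").isNull()).count()   # 1 -- matches only the NULL
demo.filter(col("number") == 0).count()     # 1 -- NULL is never equal to 0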

 

 

 

Identifying NULLs

Count NULLs per Column

# Method 1: Using filter and count
for col_name in df.columns:
    null_count = df.filter(col(col_name).isNull()).count()
    print(f"{col_name}: {null_count} NULLs")

 

# Output:

# id: 0 NULLs

# name: 1 NULLs

# age: 1 NULLs

# salary: 1 NULLs

# department: 1 NULLs

 

 

Create NULL Summary Report

from pyspark.sql.functions import count, when

# Count NULLs for all columns
null_counts = df.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in df.columns]
)

null_counts.show()

 

# Output:

# +---+----+---+------+----------+
# | id|name|age|salary|department|
# +---+----+---+------+----------+
# |  0|   1|  1|     1|         1|
# +---+----+---+------+----------+
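
Floating-point columns can also contain NaN (not-a-number) values, which are distinct from NULL; the isnan function imported earlier catches those. A sketch that counts NULLs and NaNs together, restricted to float/double columns (the sample df has none, so the body is illustrative):

# Count NULL or NaN per floating-point column
float_cols = [c for c, t in df.dtypes if t in ("float", "double")]

if float_cols:
    df.select(
        [count(when(col(c).isNull() | isnan(col(c)), c)).alias(c) for c in float_cols]
    ).show()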

 

 

Find Rows with NULLs

# Show rows with a NULL salary
df.filter(col("salary").isNull()).show()

# Show rows with NULL in either of two specific columns
df.filter((col("name").isNull()) | (col("age").isNull())).show()

# Show rows with NULL in both columns (rare)
df.filter((col("name").isNull()) & (col("age").isNull())).show()
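
To find rows with a NULL in any column (not just a hand-picked few), the per-column checks can be combined with a fold; a minimal sketch:

from functools import reduce
from operator import or_

# Rows where at least one column is NULL
any_null = reduce(or_, [col(c).isNull() for c in df.columns])
df.filter(any_null).show()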

 

Strategy 1: Drop NULLs

Drop Rows with ANY NULL

# Remove any row with at least one NULL
df_clean = df.dropna()

print(f"Before: {df.count()} rows")
print(f"After: {df_clean.count()} rows")

df_clean.show()

 

# Output:

# Before: 5 rows

# After: 1 rows

# Only Alice remains (she has all values)

 

 

Drop Rows with NULL in Specific Columns

# Drop only if NULL in critical columns

df_clean = df .dropna ( subset =[ "id" , "name" , "salary" ])

 

print ( f "Before : { df .count () } rows" )

print ( f "After : { df_ clean .count () } rows" )

 

df_ clean .show ()

 

# Output:
# Before: 5 rows
# After: 3 rows
# Drops row 3 (NULL name) and row 4 (Diana, NULL salary) but keeps the others

 

 

Drop Only if ALL Values are NULL

# Drop only if all columns are NULL
df_clean = df.dropna(how="all")

print(f"After dropna(how='all'): {df_clean.count()} rows")
# All 5 rows remain (no row has all NULLs)
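
dropna also accepts a thresh argument, which keeps a row only if it has at least that many non-NULL values:

# Keep rows with at least 4 non-NULL values out of the 5 columns
df_clean = df.dropna(thresh=4)

print(f"After dropna(thresh=4): {df_clean.count()} rows")
# All 5 rows remain, since each row here has at most one NULL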

 

Strategy 2: Fill NULLs

Fill with Specific Values

# Fill with defaults

df_filled = df.fillna({
    "name": "Unknown",
    "age": 0,
    "salary": 50000,
    "department": "Unassigned"
})

df_filled.show()

 

# Output:
# +---+-------+---+------+-----------+
# | id|   name|age|salary| department|
# +---+-------+---+------+-----------+
# |  1|  Alice| 28| 75000|Engineering|
# |  2|    Bob|  0| 85000|      Sales|
# |  3|Unknown| 25| 65000|  Marketing|
# |  4|  Diana| 35| 50000|Engineering|
# |  5|    Eve| 29| 80000| Unassigned|
# +---+-------+---+------+-----------+

Fill with Statistics

from pyspark.sql.functions import mean

# Calculate mean salary (collect() returns a list of Rows, so unwrap the value)
mean_salary = df.agg(mean("salary")).collect()[0][0]

# Fill NULLs with the mean
df_mean_filled = df.fillna({"salary": mean_salary})

print(f"Mean salary: {mean_salary}")

# Fill with the median (approxQuantile returns one value per requested quantile)
median_salary = df.approxQuantile("salary", [0.5], 0.05)[0]
df_median_filled = df.fillna({"salary": median_salary})

print(f"Median salary: {median_salary}")

 

 

Fill Different Columns Differently

# Different fill strategies for different columns
mean_salary = df.agg({"salary": "avg"}).collect()[0][0]

df_filled = df.fillna({
    "name": "Unknown",
    "age": 30,  # Default age
    "salary": mean_salary,  # Mean
    "department": "Unassigned"  # Default department
})

 

 

 

Strategy 3: Use Coalesce

Select First Non-NULL Value

from pyspark.sql.functions import coalesce, lit

# Use nickname if available, otherwise use name
# (assumes a DataFrame that has a nickname column)
df.select(
    col("id"),
    coalesce(col("nickname"), col("name")).alias("display_name"),
    col("salary")
).show()

# Multiple options: first non-NULL phone number, with a literal fallback
# (assumes phone_mobile, phone_home and phone_work columns exist)
df.select(
    col("id"),
    coalesce(
        col("phone_mobile"),
        col("phone_home"),
        col("phone_work"),
        lit("No phone")
    ).alias("contact_phone")
).show()

 

 

 

Strategy 4: Conditional Handling

Use when/otherwise for NULLs

from pyspark.sql.functions import when

# Mark rows with NULLs
df.withColumn(
    "data_quality",
    when(
        (col("name").isNotNull()) &
        (col("age").isNotNull()) &
        (col("salary").isNotNull()),
        "Complete"
    ).otherwise("Incomplete")
).show()

 

# Output:
# +---+-----+----+------+-----------+------------+
# | id| name| age|salary| department|data_quality|
# +---+-----+----+------+-----------+------------+
# |  1|Alice|  28| 75000|Engineering|    Complete|
# |  2|  Bob|null| 85000|      Sales|  Incomplete|
# ...

 

 

 

Practical Example: Comprehensive NULL Handling

from pyspark.sql.functions import *

# Start with raw data
df = spark.read.csv("employees_messy.csv", header=True, inferSchema=True)

 

print ( "=== NULL Analysis ===" )

null_counts = df .select (

    [ count ( when ( col ( c ).isNull (), c ) ).alias ( c ) for c in df .columns ]

)

null_ counts .show ()

 

# Strategy: Drop critical NULLs, fill optional with defaults
df_clean = df.dropna(subset=["id", "name", "salary"])  # Drop rows missing key fields

df_clean = df_clean.fillna({
    "age": 0,
    "department": "Unassigned",
    "manager_id": 1,
    "phone": "Not provided"
})

# Mark data quality
df_clean = df_clean.withColumn(
    "data_quality",
    when(
        (col("age") == 0) |
        (col("department") == "Unassigned"),
        "Imputed"
    ).otherwise("Original")
)

 

print ( " n === After Cleaning ===" )

print ( f "Rows removed: { df .count () df_ clean .count () } " )

print ( f "Final row count: { df_ clean .count () } " )

 

df_ clean .show ()

 

 

 

Performance Considerations

NULL Check Before Operations

# GOOD: Check for NULL before the mathematical operation
df.withColumn(
    "salary_per_month",
    when(col("salary").isNotNull(), col("salary") / 12)
    .otherwise(0)
).show()

 

# BAD: NULL not handled, so it silently propagates through the arithmetic
df.withColumn("salary_per_month", col("salary") / 12).show()
# Rows with a NULL salary end up with a NULL salary_per_month
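
A related gotcha: comparisons involving NULL evaluate to NULL (treated as false by filter), so plain equality checks silently skip missing values. A short sketch:

# Equality against None never matches, because NULL compared to anything is NULL
df.filter(col("department") == None).count()           # always 0

# Use isNull() or the null-safe equality operator instead
df.filter(col("department").isNull()).count()          # counts NULL departments
df.filter(col("department").eqNullSafe(None)).count()  # same result, null-safe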

 

 

Cache After NULL Handling

# Handle NULLs once, cache for reuse
df_clean = (
    df.dropna(subset=["id", "name"])
      .fillna({"salary": 50000})
)

df_clean.cache()  # Keep in memory
df_clean.count()  # Trigger caching

# Now all downstream operations use the clean, cached data

 

 

 

Decision Tree: When to Drop vs Fill

Found NULLs?

  ↓

Is column critical?  →  YES  → Drop rows with NULL

  ↓ NO

Can you impute safely?  →  YES  → Fill with mean/median

  ↓ NO

Can you mark as unknown?  →  YES  → Fill with "Unknown"

  ↓ NO

Keep as NULL and handle downstream
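
The same decision logic can be wrapped in a small helper that applies a per-column policy; a minimal sketch (the handle_nulls helper and its policy format are made up for illustration):

# Hypothetical helper: policy maps column -> ("drop", None), ("fill", value) or ("keep", None)
def handle_nulls(df, policy):
    drop_cols = [c for c, (action, _) in policy.items() if action == "drop"]
    fill_map = {c: v for c, (action, v) in policy.items() if action == "fill"}
    if drop_cols:
        df = df.dropna(subset=drop_cols)
    if fill_map:
        df = df.fillna(fill_map)
    return df  # "keep" columns stay NULL for downstream handling

df_clean = handle_nulls(df, {
    "id": ("drop", None),               # critical -> drop rows
    "salary": ("fill", 50000),          # safe to impute
    "department": ("fill", "Unknown"),  # mark as unknown
    "age": ("keep", None),              # handle downstream
})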

 

 

💡 Best Practices

1.  Analyze NULLs first: Understand patterns before handling

2.  Document your strategy: Why drop vs fill specific columns

3.  Don't lose too much data: Balance quality vs row count

4.  Keep original column: Create new column for imputed values if possible (see the sketch after this list)

5.  Validate results: Verify NULL handling didn't introduce errors

6.  Use statistics: Mean/median better than arbitrary defaults

7.  Mark imputed data: Track which values were filled
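
A minimal sketch of practices 4 and 7: impute into a new column and record which rows were filled (the salary_filled and salary_was_imputed column names are made up for illustration):

from pyspark.sql.functions import coalesce, lit, col

default_salary = 50000  # illustrative default

df_tracked = (
    df.withColumn("salary_filled", coalesce(col("salary"), lit(default_salary)))
      .withColumn("salary_was_imputed", col("salary").isNull())
)

# The original salary column is untouched; salary_was_imputed marks filled rows
df_tracked.select("id", "salary", "salary_filled", "salary_was_imputed").show()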

 

 

📚 Study Notes

•   dropna(): Remove rows with NULLs
•   fillna(): Replace NULLs with values
•   coalesce(): Return first non-NULL value
•   isNull(), isNotNull(): Check for NULL values
•   count(when(...)): Count NULLs per column
•   Strategies: Drop (lose data), Fill (keep with defaults), Coalesce (pick best), Mark (track quality)

 

 
