Creating Derived Columns

 

Creating derived columns is at the heart of feature engineering in Spark. A derived column is computed from existing data through arithmetic, string manipulation, conditional logic, or type conversion, and mastering this skill lets you build rich datasets that answer business questions and improve downstream analyses.

 

Basic Derived Columns with withColumn()

Arithmetic Operations

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("DerivedColumns").getOrCreate()

df = spark.read.csv("employees.csv", header=True, inferSchema=True)

 

# Add 10% raise
df.withColumn("raised_salary", col("salary") * 1.1).show()

# Bonus calculation (10% of salary)
df.withColumn("bonus", col("salary") * 0.1).show()

# Age in months
df.withColumn("age_months", col("age") * 12).show()

 

# Multiple derived columns (wrap the chain in parentheses to span lines)
df_enhanced = (
    df.withColumn("raised_salary", col("salary") * 1.1)
      .withColumn("bonus", col("salary") * 0.1)
      .withColumn("total_comp", col("salary") + col("bonus"))
)

df_enhanced.show()
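
If the name passed to withColumn() already exists, the column is replaced rather than duplicated, which is handy for in-place updates. A quick sketch:

# Reusing an existing column name overwrites that column in place
df_updated = df.withColumn("salary", col("salary") * 1.1)
df_updated.show()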

 

 

 

String Operations

String Manipulation

from pyspark.sql.functions import col, concat, lit, substring, upper, lower, length, trim, concat_ws

# Concatenate columns (string literals must be wrapped in lit())
df.withColumn("full_info", concat(col("name"), lit(" - "), col("salary").cast("string"))).show()

# Convert to uppercase
df.withColumn("name_upper", upper(col("name"))).show()

# Convert to lowercase
df.withColumn("name_lower", lower(col("name"))).show()

# Get first N characters
df.withColumn("name_first_3", substring(col("name"), 1, 3)).show()

# String length
df.withColumn("name_length", length(col("name"))).show()

# Trim whitespace
df.withColumn("name_trimmed", trim(col("name"))).show()

# Multiple columns concatenated with a separator
df.withColumn("full_address", concat_ws(", ", col("city"), col("state"), col("zip"))).show()

 

 

 

Conditional Columns with when/otherwise

Simple Conditionals

from pyspark.sql.functions import when

# Single condition
df.withColumn(
    "salary_level",
    when(col("salary") > 80000, "High").otherwise("Low")
).show()
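
One detail worth noting: if otherwise() is omitted, rows that match no condition come back as NULL. A minimal sketch (the column name is illustrative):

# Without otherwise(), non-matching rows get NULL in the new column
df.withColumn(
    "high_earner_flag",
    when(col("salary") > 80000, "High")
).show()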

 

 

Multiple Conditions

# Multiple conditions (else-if)
df.withColumn(
    "salary_category",
    when(col("salary") > 100000, "Executive")
    .when(col("salary") > 80000, "Senior")
    .when(col("salary") > 60000, "Mid-Level")
    .otherwise("Junior")
).show()

 

# Result:
# +-----+------+---------------+
# | name|salary|salary_category|
# +-----+------+---------------+
# |Alice| 75000|      Mid-Level|
# |  Bob| 85000|         Senior|
# |Diana| 90000|         Senior|
# |  Eve|110000|      Executive|
# +-----+------+---------------+

 

 

Nested Conditions

# Nested when/otherwise with boolean logic (& = and, | = or)
df.withColumn(
    "performance_tier",
    when(
        (col("salary") > 80000) & (col("years_employed") > 5),
        "Top Performer"
    )
    .when(
        (col("salary") > 70000) | (col("years_employed") > 3),
        "Good Performer"
    )
    .otherwise("Standard")
).show()

 

 

 

Type Casting

Cast to Different Types

from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType

# cast() is a method on Column objects, not a standalone function

# Cast salary to string
df.withColumn("salary_str", col("salary").cast(StringType())).show()

# Cast age to double
df.withColumn("age_double", col("age").cast(DoubleType())).show()

# Cast string to date
df.withColumn("hire_date", col("hire_date_str").cast(DateType())).show()

# Alternative cast syntax using a type name
df.withColumn("salary_string", col("salary").cast("string")).show()
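
Casting a string straight to DateType() only works when the string is already in a format Spark recognizes, typically ISO yyyy-MM-dd. For other layouts, to_date() with an explicit pattern is the usual route; the sketch below assumes a hypothetical hire_date_str column formatted like "03/15/2020".

from pyspark.sql.functions import to_date

# Parse a non-ISO date string by supplying the pattern explicitly
df.withColumn("hire_date", to_date(col("hire_date_str"), "MM/dd/yyyy")).show()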

 

 

Date and Time Operations

Date Calculations

from pyspark.sql.functions import datediff, current_date, date_add, date_sub, year, month, dayofmonth

# Days since hire date
df.withColumn(
    "days_employed",
    datediff(current_date(), col("hire_date"))
).show()

# Approximate years of employment
df.withColumn(
    "years_employed",
    datediff(current_date(), col("hire_date")) / 365
).show()

# Add days to date
df.withColumn("next_review", date_add(col("hire_date"), 365)).show()

# Extract year
df.withColumn("hire_year", year(col("hire_date"))).show()

# Extract month
df.withColumn("hire_month", month(col("hire_date"))).show()

# Extract day
df.withColumn("hire_day", dayofmonth(col("hire_date"))).show()
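
Dividing datediff() by 365 ignores leap years. For a slightly more precise figure, months_between() divided by 12 is a common alternative; a sketch, assuming the same hire_date column:

from pyspark.sql.functions import months_between, round as spark_round

# months_between() respects calendar month lengths
df.withColumn(
    "years_employed_precise",
    spark_round(months_between(current_date(), col("hire_date")) / 12, 1)
).show()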

 

 

 

Advanced Derived Columns

Bucketing/Binning

# Create age groups
df.withColumn(
    "age_group",
    when(col("age") < 25, "20-24")
    .when(col("age") < 30, "25-29")
    .when(col("age") < 35, "30-34")
    .when(col("age") < 40, "35-39")
    .otherwise("40+")
).show()
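
When the bins are uniform in width, the bucket label can also be derived arithmetically instead of enumerating every range; a sketch assuming 5-year buckets:

from pyspark.sql.functions import floor, concat, lit

# Lower bound of each 5-year bucket: floor(age / 5) * 5
bucket_low = (floor(col("age") / 5) * 5).cast("int")

df.withColumn(
    "age_group",
    concat(bucket_low.cast("string"), lit("-"), (bucket_low + 4).cast("string"))
).show()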

 

 

Ranking and Cumulative Functions

from pyspark.sql.functions import rank, dense_rank, row_number
from pyspark.sql.window import Window

# Rank employees by salary
window_spec = Window.orderBy(col("salary").desc())
df.withColumn("salary_rank", rank().over(window_spec)).show()
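
The imports above also cover dense_rank() and row_number(), which handle ties differently (dense_rank() leaves no gaps, row_number() breaks ties arbitrarily), and Window.partitionBy() restricts the ranking to each group. A sketch, assuming a hypothetical department column:

# Compare the three ranking functions and add a per-department rank
dept_window = Window.partitionBy("department").orderBy(col("salary").desc())

df.withColumn("salary_rank", rank().over(window_spec)) \
  .withColumn("salary_dense_rank", dense_rank().over(window_spec)) \
  .withColumn("salary_row_number", row_number().over(window_spec)) \
  .withColumn("dept_rank", rank().over(dept_window)) \
  .show()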

 

 

Complex Formulas

# BMI calculation (weight_kg / (height_m ** 2))
df.withColumn(
    "bmi",
    col("weight") / (col("height") ** 2)
).show()

# Compound interest
df.withColumn(
    "future_value",
    col("principal") * ((1 + col("rate") / 100) ** col("years"))
).show()

# Percentage of total
# collect() returns a list of Rows, so pull out the single aggregate value
total_salary = df.agg({"salary": "sum"}).collect()[0][0]
df.withColumn(
    "salary_pct",
    (col("salary") / total_salary) * 100
).show()
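
The percentage-of-total example collects the aggregate back to the driver first. The same result can be expressed with a window spanning the whole DataFrame, though note that an unpartitioned window pulls every row into a single partition; a sketch:

from pyspark.sql.functions import sum as spark_sum
from pyspark.sql.window import Window

# An empty partitionBy() makes the window cover the entire DataFrame
total_window = Window.partitionBy()

df.withColumn(
    "salary_pct",
    (col("salary") / spark_sum("salary").over(total_window)) * 100
).show()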

 

 

 

Practical Example: Feature Engineering

from pyspark.sql.functions import *

# Read employee data
df = spark.read.csv("employees.csv", header=True, inferSchema=True)

# Create multiple derived columns
df_features = (
    df.withColumn("annual_bonus", col("salary") * 0.15)
      .withColumn("total_compensation", col("salary") + col("annual_bonus"))
      .withColumn("salary_level",
          when(col("salary") > 100000, "High")
          .when(col("salary") > 70000, "Medium")
          .otherwise("Low"))
      .withColumn("is_senior", col("years_employed") > 5)
      .withColumn("name_upper", upper(col("name")))
      .withColumn("salary_per_year_of_service", col("salary") / col("years_employed"))
      .withColumn("is_high_value_employee",
          when(
              (col("salary") > 80000) & (col("years_employed") > 3),
              "Yes"
          ).otherwise("No"))
      .select(
          "id", "name_upper", "age", "salary", "annual_bonus",
          "total_compensation", "salary_level", "is_senior",
          "salary_per_year_of_service", "is_high_value_employee"
      )
)

df_features.show()

 

# Result table with all derived columns

 

 

 

Performance Tips

Keep Track of Columns

# GOOD: withColumn() keeps the original columns alongside the new one
df_enhanced = df.withColumn("raised_salary", col("salary") * 1.1)
# df_enhanced now has both "salary" and "raised_salary"

# VERBOSE: reassigning the DataFrame once per new column
df = df.withColumn("bonus_1", col("salary") * 0.1)
df = df.withColumn("bonus_2", col("salary") * 0.15)
df = df.withColumn("bonus_3", col("salary") * 0.2)

# BETTER: chain the calls and create the columns in one expression
df_with_bonuses = (
    df.withColumn("bonus_10pct", col("salary") * 0.1)
      .withColumn("bonus_15pct", col("salary") * 0.15)
      .withColumn("bonus_20pct", col("salary") * 0.2)
)
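
A single select() projection is another way to add several derived columns at once; the aliases below are illustrative. Recent Spark versions (3.3+) also provide withColumns(), which accepts a dict of name-to-expression pairs.

# Build several derived columns in one projection
df_with_bonuses = df.select(
    "*",
    (col("salary") * 0.1).alias("bonus_10pct"),
    (col("salary") * 0.15).alias("bonus_15pct"),
    (col("salary") * 0.2).alias("bonus_20pct"),
)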

 

 

💡 Best Practices

•   Chain withColumn() calls: More readable than repeatedly reassigning the DataFrame

•   Use clear column names: Describe what the column represents

•   Document complex formulas: Add comments explaining the business logic

•   Test with sample data: Verify calculations before a full run

•   Consider NULL values: Use coalesce() to supply defaults where needed (see the sketch after this list)

•   Avoid expensive operations: Apply complex logic only to the rows and columns that need it
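
A minimal sketch of the NULL-handling point above, assuming a bonus column that may contain NULLs:

from pyspark.sql.functions import coalesce, lit

# Treat a missing bonus as 0 before deriving total compensation
df.withColumn(
    "total_comp",
    col("salary") + coalesce(col("bonus"), lit(0))
).show()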

📚 Study Notes

•   withColumn(): Add a new column or replace an existing one

•   Arithmetic: *, /, +, -, ** (power)

•   String ops: upper(), lower(), substring(), concat(), length()

•   Conditional: when(), otherwise()

•   Type casting: col("name").cast(type)

•   Dates: datediff(), date_add(), year(), month(), dayofmonth()

•   Chaining: Multiple withColumn() calls

 

 

 
