Creating Derived Columns
Creating derived columns is at the heart of feature engineering in Spark. A derived column is computed from existing data through arithmetic operations, string manipulation, conditional logic, or type conversion. Mastering this skill lets you build rich datasets that answer business questions and improve downstream analyses.
Basic Derived Columns with withColumn()
Arithmetic Operations
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("DerivedColumns").getOrCreate()
df = spark.read.csv("employees.csv", header=True, inferSchema=True)

# Add 10% raise
df.withColumn("raised_salary", col("salary") * 1.1).show()

# Bonus calculation (10% of salary)
df.withColumn("bonus", col("salary") * 0.1).show()

# Age in months
df.withColumn("age_months", col("age") * 12).show()
# Multiple derived columns (wrap the chain in parentheses for line continuation)
df_enhanced = (
    df.withColumn("raised_salary", col("salary") * 1.1)
      .withColumn("bonus", col("salary") * 0.1)
      .withColumn("total_comp", col("salary") + col("bonus"))
)
df_enhanced.show()
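Arithmetic results often carry long decimal tails. A minimal sketch, assuming the same salary column, of rounding a derived value with round() (aliased here so it does not shadow Python's builtin):

from pyspark.sql.functions import round as spark_round

# Round the 10% raise to 2 decimal places
df.withColumn("raised_salary", spark_round(col("salary") * 1.1, 2)).show()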
String Operations
String Manipulation
from pyspark.sql.functions import col, concat, substring, upper, lower, length, trim, concat_ws, lit

# Concatenate columns (string literals must be wrapped in lit())
df.withColumn("full_info", concat(col("name"), lit(" - "), col("salary"))).show()

# Convert to uppercase
df.withColumn("name_upper", upper(col("name"))).show()

# Convert to lowercase
df.withColumn("name_lower", lower(col("name"))).show()

# Get first N characters
df.withColumn("name_first_3", substring(col("name"), 1, 3)).show()

# String length
df.withColumn("name_length", length(col("name"))).show()

# Trim whitespace
df.withColumn("name_trimmed", trim(col("name"))).show()

# Multiple columns concatenated with a separator
df.withColumn("full_address", concat_ws(", ", col("city"), col("state"), col("zip"))).show()
Conditional Columns with when/otherwise
Simple Conditionals
from pyspark.sql.functions import when

# Single condition
df.withColumn(
    "salary_level",
    when(col("salary") > 80000, "High").otherwise("Low")
).show()
Multiple Conditions
# Multiple conditions (else-if style)
df.withColumn(
    "salary_category",
    when(col("salary") > 100000, "Executive")
    .when(col("salary") > 80000, "Senior")
    .when(col("salary") > 60000, "Mid-Level")
    .otherwise("Junior")
).show()
# Result:
# +-----+------+---------------+
# | name|salary|salary_category|
# +-----+------+---------------+
# |Alice| 75000|      Mid-Level|
# |  Bob| 85000|         Senior|
# |Diana| 90000|         Senior|
# |  Eve|110000|      Executive|
# +-----+------+---------------+
Nested Conditions
# Nested when/otherwise with combined boolean conditions
df.withColumn(
    "performance_tier",
    when(
        (col("salary") > 80000) & (col("years_employed") > 5),
        "Top Performer"
    )
    .when(
        (col("salary") > 70000) | (col("years_employed") > 3),
        "Good Performer"
    )
    .otherwise("Standard")
).show()
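Because & and | bind more tightly than comparison operators in Python, each comparison must be wrapped in parentheses as above. Conditions are ordinary Column expressions, so they can also be assigned to variables and reused; a small sketch assuming the same salary and years_employed columns:

# Reusable boolean Column expressions
is_well_paid = col("salary") > 80000
is_tenured = col("years_employed") > 5

df.withColumn(
    "performance_tier",
    when(is_well_paid & is_tenured, "Top Performer")
    .when(is_well_paid | is_tenured, "Good Performer")
    .otherwise("Standard")
).show()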
Type Casting
Cast to Different Types
from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType

# Casting is done with the Column.cast() method (there is no cast() function
# in pyspark.sql.functions)

# Cast salary to string
df.withColumn("salary_str", col("salary").cast(StringType())).show()

# Cast age to double
df.withColumn("age_double", col("age").cast(DoubleType())).show()

# Cast string to date (works when the string is already yyyy-MM-dd)
df.withColumn("hire_date", col("hire_date_str").cast(DateType())).show()

# Alternative cast syntax using a type-name string
df.withColumn("salary_string", col("salary").cast("string")).show()
Date and Time Operations
Date Calculations
from pyspark.sql.functions import datediff, current_date, date_add, date_sub, year, month, dayofmonth

# Days since hire date
df.withColumn(
    "days_employed",
    datediff(current_date(), col("hire_date"))
).show()

# Years of employment (approximate)
df.withColumn(
    "years_employed",
    datediff(current_date(), col("hire_date")) / 365
).show()

# Add days to date
df.withColumn("next_review", date_add(col("hire_date"), 365)).show()

# Extract year
df.withColumn("hire_year", year(col("hire_date"))).show()

# Extract month
df.withColumn("hire_month", month(col("hire_date"))).show()

# Extract day
df.withColumn("hire_day", dayofmonth(col("hire_date"))).show()
Advanced Derived Columns
Bucketing/Binning
# Create age groups
df.withColumn(
    "age_group",
    when(col("age") < 25, "20-24")
    .when(col("age") < 30, "25-29")
    .when(col("age") < 35, "30-34")
    .when(col("age") < 40, "35-39")
    .otherwise("40+")
).show()
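For numeric bucketing with many boundaries, the ML library's Bucketizer can be less verbose than a long when() chain. A sketch, assuming the age column has no nulls (the splits must cover the full value range):

from pyspark.ml.feature import Bucketizer

bucketizer = Bucketizer(
    splits=[0, 25, 30, 35, 40, float("inf")],  # bucket boundaries
    inputCol="age",
    outputCol="age_bucket"  # produces 0.0, 1.0, 2.0, ... per bucket
)
bucketizer.transform(df).show()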
Ranking and Cumulative Functions
from pyspark .sql .functions import rank , dense_rank , row_number
from pyspark .sql .window import Window
# Rank employees by salary
window_spec = Window .orderBy ( col ( "salary" ).desc ())
df .withColumn ( " salary_rank " , rank ( ).over ( window_spec ) ).show ()
Complex Formulas
# BMI calculation ( weight_kg / ( height_m ** 2))
df .withColumn (
" bmi " ,
col ( "weight" ) / ( col ( "height" ) ** 2 )
).show ()
# Compound interest
df .withColumn (
" future _value " ,
col ( "principal" ) * (( 1 + col ( "rate" ) / 100 ) ** col ( "years" ))
).show ()
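The ** operator works directly on Columns, but pyspark.sql.functions also ships math helpers such as pow(), sqrt(), and log() that read the same way. A sketch reusing the compound-interest columns above (functions aliased to avoid shadowing Python builtins):

from pyspark.sql.functions import pow as spark_pow, round as spark_round

# Same compound-interest formula written with pow(), rounded to cents
df.withColumn(
    "future_value",
    spark_round(col("principal") * spark_pow(1 + col("rate") / 100, col("years")), 2)
).show()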
# Percentage of total (collect() returns a list of Rows, so index into it)
total_salary = df.agg({"salary": "sum"}).collect()[0][0]
df.withColumn(
    "salary_pct",
    (col("salary") / total_salary) * 100
).show()
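Collecting the total to the driver works, but the same ratio can be computed entirely on the cluster with an unpartitioned window. Note that a window with no partition keys funnels all rows through a single partition, so this sketch suits small or pre-aggregated data:

from pyspark.sql.functions import sum as spark_sum

# Percentage of total without collect(): sum over an empty window spec
total_window = Window.partitionBy()
df.withColumn(
    "salary_pct",
    (col("salary") / spark_sum("salary").over(total_window)) * 100
).show()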
Practical Example: Feature Engineering
from pyspark.sql.functions import *

# Read employee data
df = spark.read.csv("employees.csv", header=True, inferSchema=True)

# Create multiple derived columns
df_features = (
    df.withColumn("annual_bonus", col("salary") * 0.15)
      .withColumn("total_compensation", col("salary") + col("annual_bonus"))
      .withColumn("salary_level",
                  when(col("salary") > 100000, "High")
                  .when(col("salary") > 70000, "Medium")
                  .otherwise("Low"))
      .withColumn("is_senior", col("years_employed") > 5)
      .withColumn("name_upper", upper(col("name")))
      .withColumn("salary_per_year_of_service", col("salary") / col("years_employed"))
      .withColumn("is_high_value_employee",
                  when(
                      (col("salary") > 80000) & (col("years_employed") > 3),
                      "Yes"
                  ).otherwise("No"))
      .select(
          "id", "name_upper", "age", "salary", "annual_bonus",
          "total_compensation", "salary_level", "is_senior",
          "salary_per_year_of_service", "is_high_value_employee"
      )
)
df_features.show()
# Result table with all derived columns
Performance Tips
Keep Track of Columns
# GOOD: Keep original columns if needed
df_enhanced = df.withColumn("raised_salary", col("salary") * 1.1)
# Now have both "salary" and "raised_salary"

# INEFFICIENT: Reassigning the DataFrame once per new column
df = df.withColumn("bonus_1", col("salary") * 0.1)
df = df.withColumn("bonus_2", col("salary") * 0.15)
df = df.withColumn("bonus_3", col("salary") * 0.2)

# BETTER: Create multiple at once in a single chain
df_with_bonuses = (
    df.withColumn("bonus_10pct", col("salary") * 0.1)
      .withColumn("bonus_15pct", col("salary") * 0.15)
      .withColumn("bonus_20pct", col("salary") * 0.2)
)
💡 Best Practices
• Chain withColumn() calls: more readable than repeated reassignment
• Use clear column names: describe what the column represents
• Document complex formulas: add comments for business logic
• Test with sample data: verify calculations before a full run
• Consider NULL values: use coalesce() if needed (see the sketch after this list)
• Avoid expensive operations: apply complex logic only where it is needed, e.g. after filtering
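A minimal sketch of the coalesce() idea mentioned above, assuming salary may contain nulls and that 0 is an acceptable fallback:

from pyspark.sql.functions import coalesce, lit

# Fall back to 0 when salary is NULL so the derived bonus is never NULL
df.withColumn("bonus", coalesce(col("salary"), lit(0)) * 0.1).show()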
📚 Study Notes
• withColumn(): add a new column or replace an existing one
• Arithmetic: *, /, +, -, ** (power)
• String ops: upper(), lower(), substring(), concat(), length()
• Conditional: when(), otherwise()
• Type casting: col("x").cast(type)
• Dates: datediff(), date_add(), year(), month(), dayofmonth()
• Chaining: multiple withColumn() calls