Reading JSON Files with Nested Structures

 

JSON files often contain nested structures, arrays, and objects that require specialized handling techniques. Mastering JSON ingestion in Spark enables you to work with modern API responses, hierarchical data, and complex real-world datasets. Understanding how to flatten, explode, and navigate nested JSON turns Spark from a purely tabular tool into a powerful engine for semi-structured data.

JSON vs CSV

Aspect

CSV

JSON

Format

Flat, tabular

Nested, hierarchical

Nesting

Not supported

Full support

Arrays

Not native

Native support

Readability

Easy

Complex for nested

Performance

Fast

Slower (more parsing)

 

Reading Flat JSON

JSON Lines Format (Most Common)

# Create a sample JSON Lines file (one JSON object per line).
# Note: the filename must follow ">" with no space, or the shell
# would redirect to a file literally named "users".
cat > users.jsonl << 'EOF'
{"id": 1, "name": "Alice", "age": 28, "city": "New York"}
{"id": 2, "name": "Bob", "age": 32, "city": "San Francisco"}
{"id": 3, "name": "Charlie", "age": 25, "city": "Boston"}
EOF

Read with Spark

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ReadJSON").getOrCreate()

# Read JSON Lines — Spark's default JSON format (one object per line).
# The path string must not contain stray spaces, or Spark looks for
# a file literally named " users.jsonl ".
df = spark.read.json("users.jsonl")

# Schema is inferred automatically from the data; note that top-level
# fields come back sorted alphabetically.
df.printSchema()
# root
#  |-- age: long (nullable = true)
#  |-- city: string (nullable = true)
#  |-- id: long (nullable = true)
#  |-- name: string (nullable = true)

df.show()

 

 

Reading Nested JSON

Nested Structure Example

# Create a JSON Lines file with nested objects and arrays.
# Keys must not have trailing spaces ("customer ", "items "),
# otherwise dot-notation access like customer.name fails later.
cat > orders.jsonl << 'EOF'
{"order_id": 1, "customer": {"name": "Alice", "age": 28}, "items": [{"product": "Laptop", "price": 1000}, {"product": "Mouse", "price": 50}], "total": 1050}
{"order_id": 2, "customer": {"name": "Bob", "age": 32}, "items": [{"product": "Phone", "price": 800}], "total": 800}
EOF

 

 

Read and Explore

df = spark.read.json("orders.jsonl")

# Nested objects become struct columns; JSON arrays become array columns.
df.printSchema()
# root
#  |-- customer: struct (nullable = true)
#  |    |-- age: long (nullable = true)
#  |    |-- name: string (nullable = true)
#  |-- items: array (nullable = true)
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- price: long (nullable = true)
#  |    |    |-- product: string (nullable = true)
#  |-- order_id: long (nullable = true)
#  |-- total: long (nullable = true)

# truncate=False shows full nested values instead of clipping at 20 chars
df.show(truncate=False)

 

 

Accessing Nested Fields

Dot Notation

# Access nested struct fields with dot notation: "customer.name"
# reaches inside the customer struct column.
df.select("order_id", "customer.name", "customer.age").show()

# Output:
# +--------+-----+---+
# |order_id| name|age|
# +--------+-----+---+
# |       1|Alice| 28|
# |       2|  Bob| 32|
# +--------+-----+---+

 

 

Column Method

from pyspark.sql.functions import col

# Alternative syntax: col() builds a Column object, which is required
# when you want to chain .alias(), .cast(), etc.
df.select(
    col("order_id"),
    col("customer.name"),
    col("customer.age")
).show()

 

 

 

Working with Arrays

Explode Arrays

from pyspark.sql.functions import explode

# explode() emits one output row per array element, so an order with
# two items becomes two rows (row count can grow significantly).
df_exploded = df.select(
    "order_id",
    explode("items").alias("item")
)

df_exploded.show()
# +--------+----------------+
# |order_id|            item|
# +--------+----------------+
# |       1|  [Laptop, 1000]|
# |       1|     [Mouse, 50]|
# |       2|    [Phone, 800]|
# +--------+----------------+

# After exploding, each item is a struct; drill into its fields
# with the same dot notation.
df_exploded.select(
    "order_id",
    "item.product",
    "item.price"
).show()

 

 

 

Multiline JSON Files

Regular JSON Format (Pretty Printed)

# Create a pretty-printed (multiline) JSON file: a single object
# spread across many lines, as opposed to JSON Lines.
cat > config.json << 'EOF'
{
  "database": {
    "host": "localhost",
    "port": 5432,
    "name": "mydb"
  },
  "cache": {
    "enabled": true,
    "ttl": 3600
  }
}
EOF

 

 

Read Multiline JSON

# For pretty-printed JSON files: without multiLine=true, Spark treats
# each physical line as a separate record and parsing fails
# (you get a _corrupt_record column).
df = spark.read.option("multiLine", "true").json("config.json")
df.show()

 

 

 

Complete Practical Example

Realistic API Response

# Create a realistic API-style JSON Lines file with both nested
# structs (profile) and arrays of structs (purchases).
# Key names must be exact ("profile", not "profile ") for the
# dot-notation accesses used below to work.
cat > api_response.jsonl << 'EOF'
{"user_id": 1, "user_name": "Alice", "email": "alice@example.com", "profile": {"age": 28, "location": "New York", "interests": ["Python", "Data Science"]}, "purchases": [{"id": "P001", "product": "Laptop", "price": 1000, "date": "2024-01-15"}, {"id": "P002", "product": "Mouse", "price": 50, "date": "2024-01-20"}]}
{"user_id": 2, "user_name": "Bob", "email": "bob@example.com", "profile": {"age": 32, "location": "San Francisco", "interests": ["Java", "Cloud"]}, "purchases": [{"id": "P003", "product": "Phone", "price": 800, "date": "2024-02-01"}]}
EOF

 

 

Parse and Analyze

from pyspark.sql import SparkSession  # needed: this snippet is standalone
from pyspark.sql.functions import explode, col, to_date

spark = SparkSession.builder.appName("ParseJSON").getOrCreate()

df = spark.read.json("api_response.jsonl")

print("=== Schema ===")
df.printSchema()

print("\n=== Basic Info ===")
df.select("user_id", "user_name", "email").show()

# Nested struct fields via dot notation
print("\n=== Nested Profile ===")
df.select(
    "user_id",
    "profile.age",
    "profile.location"
).show()

# Array nested inside a struct: explode profile.interests directly
print("\n=== Exploded Interests ===")
df.select(
    "user_id",
    explode("profile.interests").alias("interest")
).show()

# Array of structs: explode first, then drill into each struct
print("\n=== Exploded Purchases ===")
purchases = df.select(
    "user_id",
    "user_name",
    explode("purchases").alias("purchase")
)

purchases.select(
    "user_id",
    "user_name",
    col("purchase.product"),
    col("purchase.price")
).show()

spark.stop()

 

 

 

Common Patterns

Pattern 1: Flatten Nested Structure

from pyspark.sql.functions import col

# Promote nested struct fields to top-level columns, renaming them
# so the flattened DataFrame has unambiguous column names.
df_flat = df.select(
    "order_id",
    col("customer.name").alias("customer_name"),
    col("customer.age").alias("customer_age"),
    col("total")
)

df_flat.show()

 

 

Pattern 2: Working with Complex Nesting

from pyspark.sql.functions import explode

# Multiple levels of nesting. Spark allows only ONE generator
# (explode) per select clause — putting two explode() calls in the
# same select raises AnalysisException. Flatten one array at a time
# by chaining selects.
df_processed = (
    df.select(
        "user_id",
        col("profile.location").alias("location"),
        explode("profile.interests").alias("interest"),
        "purchases",           # keep the array; explode it in the next step
    )
    .select(
        "user_id",
        "location",
        "interest",
        explode("purchases").alias("purchase"),
    )
)

df_processed.select(
    "user_id",
    "location",
    "interest",
    col("purchase.product"),
    col("purchase.price")
).show()

 

 

💡 Key Points

•   JSON files can have nested objects (structs) and arrays

•   Use dot notation or col() to access nested fields

•   Use explode() to flatten arrays

•   Set multiLine=true for pretty-printed JSON

•   Schema is automatically inferred for JSON

📚 Study Notes

•   JSON Lines: one JSON object per line (efficient for Spark)

•   Multiline JSON: pretty-printed JSON (a single object spanning many lines)

•   Nested objects: access with dot notation (customer.name)

•   Arrays: use explode() to flatten

•   Column access: df["col.nested"] or col("col.nested")

•   Complex nesting: chain explode() calls across successive selects (Spark allows one generator per select clause)

•   Performance: explode() can increase row count significantly

 

Leave a Reply