Pyspark- Fill an empty strings with a '0' if Data type is BIGINT/DOUBLE/Integer

970 views Asked by At

I am trying to fill an empty strings with a '0' if column Data type is BIGINT/DOUBLE/Integer in a dataframe using pyspark

data = [("James","","Smith","36","M",3000,"1.2"),
    ("Michael","Rose"," ","40","M",4000,"2.0"),
    ("Robert","","Williams","42","M",4000,"5.0"),
    ("Maria","Anne"," ","39","F", ," "),
    ("Jen","Mary","Brown"," ","F",-1,"")
  ]

schema = StructType([ 
    StructField("firstname",StringType(),True), 
    StructField("middlename",StringType(),True), 
    StructField("lastname",StringType(),True), 
    StructField("age", StringType(), True), 
    StructField("gender", StringType(), True), 
    StructField("salary", IntegerType(), True),
    StructField("amount", DoubleType(), True) 
  ])
 
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()

I am trying like this.

df.select( *[ F.when(F.dtype in ('integertype','doubletype') and F.col(column).ishaving(" "),'0').otherwise(F.col(column)).alias(column) for column in df.columns]).show()

Expected output:

+---------+----------+--------+---+------+------+------+                        
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
|    James|          |   Smith| 36|     M|  3000|   1.2|
|  Michael|      Rose|        | 40|     M|  4000|   2.0|
|   Robert|          |Williams| 42|     M|  4000|   5.0|
|    Maria|      Anne|        | 39|     F|     0|     0|
|      Jen|      Mary|   Brown|   |     F|    -1|     0|
+---------+----------+--------+---+------+------+------+
2

There are 2 answers

3
Vaebhav On BEST ANSWER

You can utilise reduce to accomplish this , it makes the code more cleaner and easier to understand

Additionally create a to_fill list to match the columns based on your condition , which can be further modified based on your scenarios.

Data Preparation

data = [("James","","Smith","36","M",3000,1.2),
    ("Michael","Rose"," ","40","M",4000,2.0),
    ("Robert","","Williams","42","M",4000,5.0),
    ("Maria","Anne"," ","39","F",None,None),
    ("Jen","Mary","Brown"," ","F",-1,None)
  ]

schema = StructType([ 
    StructField("firstname",StringType(),True), 
    StructField("middlename",StringType(),True), 
    StructField("lastname",StringType(),True), 
    StructField("age", StringType(), True), 
    StructField("gender", StringType(), True), 
    StructField("salary", IntegerType(), True),
    StructField("amount", DoubleType(), True) 
  ])
 
sparkDF = sql.createDataFrame(data=data,schema=schema)
sparkDF.show()

+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
|    James|          |   Smith| 36|     M|  3000|   1.2|
|  Michael|      Rose|        | 40|     M|  4000|   2.0|
|   Robert|          |Williams| 42|     M|  4000|   5.0|
|    Maria|      Anne|        | 39|     F|  null|  null|
|      Jen|      Mary|   Brown|   |     F|    -1|  null|
+---------+----------+--------+---+------+------+------+

Reduce

to_fill =  [ c for c,d in sparkDF.dtypes if d in ['int','bigint','double']] 

# to_fill --> ['salary','amount']

sparkDF = reduce(
    lambda df, x: df.withColumn(x, F.when(F.col(x).isNull(),0).otherwise(F.col(x))),
    to_fill,
    sparkDF,
)

sparkDF.show()

+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
|    James|          |   Smith| 36|     M|  3000|   1.2|
|  Michael|      Rose|        | 40|     M|  4000|   2.0|
|   Robert|          |Williams| 42|     M|  4000|   5.0|
|    Maria|      Anne|        | 39|     F|     0|   0.0|
|      Jen|      Mary|   Brown|   |     F|    -1|   0.0|
+---------+----------+--------+---+------+------+------+
0
SherKhan On

You can try this :

from pyspark.sql import *
    from pyspark.sql.functions import *
    from pyspark.sql.types import *
    
    spark = SparkSession.builder.master("local").appName("test").getOrCreate()
    data = [("James", "", "Smith", "36", "", 3000, 1.2),
            ("Michael", "Rose", "", "40", "M", 4000, 2.0),
            ("Robert", "", "Williams", "42", "M", 4000, 5.0),
            ("Maria", "Anne", " ", "39", "F", None, None),
            ("Jen", "Mary", "Brown", " ", "F", -1, None)
            ]
    
    schema = StructType([StructField("firstname", StringType(), True),StructField("middlename", StringType(), True),StructField("lastname", StringType(), True),StructField("age", StringType(), True),StructField("gender", StringType(), True),StructField("salary", IntegerType(), True),StructField("amount", DoubleType(), True)])
    
    dfa = spark.createDataFrame(data=data, schema=schema)
    dfa.show()
    
    
    def removenull(dfa):
            dfa = dfa.select([trim(col(c)).alias(c) for c in dfa.columns])
            for i in dfa.columns:
                    dfa = dfa.withColumn(i , when(col(i)=="", None ).otherwise(col(i)))
            return dfa
    
    removenull(dfa).show()

output:

+---------+----------+--------+----+------+------+------+
|firstname|middlename|lastname| age|gender|salary|amount|
+---------+----------+--------+----+------+------+------+
|    James|      null|   Smith|  36|  null|  3000|   1.2|
|  Michael|      Rose|    null|  40|     M|  4000|   2.0|
|   Robert|      null|Williams|  42|     M|  4000|   5.0|
|    Maria|      Anne|    null|  39|     F|  null|  null|
|      Jen|      Mary|   Brown|null|     F|    -1|  null|
+---------+----------+--------+----+------+------+------+