I have a Dataframe like this (in Pyspark 2.3.1):

from pyspark.sql import Row

my_data = spark.createDataFrame([
  Row(a=[9, 3, 4], b=['a', 'b', 'c'], mask=[True, False, False]),
  Row(a=[7, 2, 6, 4], b=['w', 'x', 'y', 'z'], mask=[True, False, True, False])
])
#|a           |b           |mask                      |
#|[9, 3, 4]   |[a, b, c]   |[true, false, false]      |
#|[7, 2, 6, 4]|[w, x, y, z]|[true, false, true, false]|

Now I'd like to use the mask column in order to subset the a and b columns:

my_desired_output = spark.createDataFrame([
  Row(a=[9], b=['a']),
  Row(a=[7, 6], b=['w', 'y'])
])
#|a     |b     |
#|[9]   |[a]   |
#|[7, 6]|[w, y]|

What's the "idiomatic" way to achieve this? The current solution I have involves map-ing over the underlying RDD and subsetting with Numpy, which seems inelegant:

import numpy as np

def subset_with_mask(row):
    """Return a new Row keeping only the a/b entries whose mask flag is True."""
    flags = list(row.mask)
    a_masked = [value for value, keep in zip(row.a, flags) if keep]
    b_masked = [value for value, keep in zip(row.b, flags) if keep]
    return Row(a=a_masked, b=b_masked)

my_desired_output = spark.createDataFrame(my_data.rdd.map(subset_with_mask))

Is this the best way to go, or is there something better (less verbose and/or more efficient) I can do using Spark SQL tools?

3 Answers

shadowtalker On

One option is to use a UDF, which you can optionally specialize by the data type in the array:

import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T

def _mask_list(lst, mask):
    return np.asarray(lst)[mask].tolist()

mask_array_int = F.udf(_mask_list, T.ArrayType(T.IntegerType()))
mask_array_str = F.udf(_mask_list, T.ArrayType(T.StringType()))

my_desired_output = my_data
my_desired_output = my_desired_output.withColumn(
    'a', mask_array_int(F.col('a'), F.col('mask'))
)
my_desired_output = my_desired_output.withColumn(
    'b', mask_array_str(F.col('b'), F.col('mask'))
)
ollik1 On

The UDFs mentioned in the previous answer are probably the way to go prior to the array functions added in Spark 2.4. For the sake of completeness, here is a "pure SQL" implementation before 2.4.

from pyspark.sql.functions import *

df = my_data.withColumn("row", monotonically_increasing_id())

df1 = df.select("row", posexplode("a").alias("pos", "a"))
df2 = df.select("row", posexplode("b").alias("pos", "b"))
df3 = df.select("row", posexplode("mask").alias("pos", "mask"))

df1.join(df2, ["row", "pos"])\
    .join(df3, ["row", "pos"])\
    .filter("mask")\
    .groupBy("row")\
    .agg(collect_list("a").alias("a"), collect_list("b").alias("b"))\
    .select("a", "b")\
    .show()


|     a|     b|
|[7, 6]|[w, y]|
|   [9]|   [a]|
Alexandros Biratsis On

This is one more approach with 2 UDFs for zipping and unzipping the lists:

from pyspark.sql.types import ArrayType, StructType, StructField, StringType
from pyspark.sql.functions import udf, col, lit

zip_schema = ArrayType(StructType((StructField("a", StringType()), StructField("b", StringType()))))  
unzip_schema = ArrayType(StringType())

zip_udf = udf(my_zip, zip_schema)
unzip_udf = udf(my_unzip, unzip_schema)

df = my_data.withColumn("zipped", zip_udf(col("a"), col("b"), col("mask")))\
       .withColumn("a", unzip_udf(col("zipped"), lit(0)))\
       .withColumn("b", unzip_udf(col("zipped"), lit(1)))\
       .drop("zipped", "mask")

def my_unzip(zipped, indx):
    """Extract field `indx` from every pair in `zipped`, stringified."""
    return list(map(str, (entry[indx] for entry in zipped)))

def my_zip(a, b, mask):
    """Pair a[i] with b[i] for every position i where mask[i] is truthy."""
    pairs = []
    for left, right, keep in zip(a, b, mask):
        if keep:
            pairs.append((left, right))
    return pairs

my_zip is responsible for filtering the data based on mask and creating a tuple of (cola, colb), which is also an item of the returned list.

my_unzip will extract the data for a specific indx from the data created with my_zip.


|     a|     b|
|   [9]|   [a]|
|[7, 6]|[w, y]|