I am using polars to read a parquet file from S3 as a dataframe:
use cloud::AmazonS3ConfigKey as Key;
use eyre::Result;
use polars::prelude::*;
use dotenvy::dotenv;
/// Build a streaming `LazyFrame` over a parquet object stored in S3.
///
/// AWS credentials and region are read from the environment variables
/// `aws_access_key_id`, `aws_secret_access_key`, and `aws_region`
/// (typically populated from `.env` via dotenvy). Returns an error if
/// any of them is unset.
fn lazyframe_from_s3_key(key: &str) -> Result<LazyFrame> {
    // Pull AWS settings from the environment; the names must match the
    // keys used in the .env file.
    let aws_settings = [
        (Key::AccessKeyId, std::env::var("aws_access_key_id")?),
        (Key::SecretAccessKey, std::env::var("aws_secret_access_key")?),
        (Key::Region, std::env::var("aws_region")?),
    ];
    let cloud_options = cloud::CloudOptions::default().with_aws(aws_settings);

    // Everything except `cloud_options` keeps its default value.
    let scan_args = ScanArgsParquet {
        cloud_options: Some(cloud_options),
        ..Default::default()
    };

    Ok(LazyFrame::scan_parquet(key, scan_args)?.with_streaming(true))
}
/// Time one or more `;`-separated statements, printing `"<msg>: <elapsed>"`
/// to stdout afterwards.
///
/// The expansion is deliberately NOT wrapped in a block, so `let` bindings
/// made inside the macro call remain visible in the caller's scope
/// (relied on by `dataframe_from_s3_key`).
macro_rules! time {
    ($msg:expr, $($stmts:stmt);+) => {
        let now = std::time::Instant::now();
        // Re-emit each statement with an explicit trailing `;`. The previous
        // expansion `$($stmts)*` dropped the separators matched by `;+`,
        // which breaks when more than one statement (or an expression
        // statement) is passed to the macro.
        $($stmts;)+
        println!("{}: {:.2?}", $msg, now.elapsed());
    };
}
pub fn dataframe_from_s3_key(key: &str) -> Result<DataFrame> {
time!("lazyframe scan",
let lf = lazyframe_from_s3_key(key)?);
time!("lazyframe collect",
let df = lf.collect()?);
Ok(df)
}
/// Entry point: load `.env`, fetch the test parquet file, dump the frame.
fn main() -> Result<()> {
    // Populate the process environment from `.env` (AWS credentials/region).
    dotenv()?;
    let frame = dataframe_from_s3_key("s3://xxx/test.parquet")?;
    // dbg! prints to stderr with file:line info — useful while exploring.
    dbg!(&frame);
    Ok(())
}
When I run this on a 60MB parquet file, it takes ~30s, with almost all of that time spent in the `collect()` call. When I do the equivalent in Python:
import polars as pl
df = pl.scan_parquet("s3://xxx/test.parquet").collect()
it takes only about 5s. What is Python polars doing that I'm not? How can I get the Rust version to load as quickly?