
Commit 922e6cc

docs: Update S3 snippets for Polars 0.20 (windmill-labs#451)
1 parent e81944b commit 922e6cc

File tree

2 files changed (+42 / -44 lines)
  • blog/2023-11-24-data-pipeline-orchestrator
  • docs/core_concepts/11_persistent_storage


blog/2023-11-24-data-pipeline-orchestrator/index.mdx

Lines changed: 24 additions & 23 deletions
````diff
@@ -159,7 +159,7 @@ In the end, a canonical pipeline step in Windmill will look something like this:
 
 ```python
 #requirements:
-#polars==0.19.19
+#polars==0.20.2
 #s3fs==2023.12.0
 #wmill>=1.229.0
 
@@ -172,35 +172,36 @@ import wmill
 def main(input_dataset: S3Object):
     # initialization: connect Polars to the workspace bucket
     s3_resource = wmill.get_resource("/path/to/resource")
-    s3fs_args = wmill.polars_connection_settings().s3fs_args
-    s3 = s3fs.S3FileSystem(**s3fs_args)
+    storage_options = wmill.polars_connection_settings().storage_options
 
     # reading data from s3:
     bucket = s3_resource["bucket"]
     input_dataset_uri = "s3://{}/{}".format(bucket, input_dataset["s3"])
-    output_dataset_uri = "s3://{}/output.parquet".format(bucket)
-    with s3.open(input_dataset_uri, mode="rb") as input_dataset, s3.open(output_dataset_uri, mode="rb") as output_dataset:
-        input = pl.read_parquet(input_dataset)
-
-        # transforming the data
-        output = (
-            input.filter(pl.col("L_SHIPDATE") >= datetime.datetime(1994, 1, 1))
-            .filter(
-                pl.col("L_SHIPDATE")
-                < datetime.datetime(1994, 1, 1) + datetime.timedelta(days=365)
-            )
-            .filter((pl.col("L_DISCOUNT").is_between(0.06 - 0.01, 0.06 + 0.01)))
-            .filter(pl.col("L_QUANTITY") < 24)
-            .select([(pl.col("L_EXTENDEDPRICE") * pl.col("L_DISCOUNT")).alias("REVENUE")])
-            .sum()
-            .collect()
-        )
-
-        # writing the output back to S3
+    input = pl.read_parquet(input_dataset_uri, storage_options=storage_options)
+
+    # transforming the data
+    output = (
+        input.filter(pl.col("L_SHIPDATE") >= datetime.datetime(1994, 1, 1))
+        .filter(
+            pl.col("L_SHIPDATE")
+            < datetime.datetime(1994, 1, 1) + datetime.timedelta(days=365)
+        )
+        .filter((pl.col("L_DISCOUNT").is_between(0.06 - 0.01, 0.06 + 0.01)))
+        .filter(pl.col("L_QUANTITY") < 24)
+        .select([(pl.col("L_EXTENDEDPRICE") * pl.col("L_DISCOUNT")).alias("REVENUE")])
+        .sum()
+        .collect()
+    )
+
+    # writing the output back to S3
+    s3 = s3fs.S3FileSystem(**wmill.polars_connection_settings().s3fs_args)
+    output_dataset_filename = "output.parquet"
+    output_dataset_uri = "s3://{}/{}".format(bucket, output_dataset_filename)
+    with s3.open(output_dataset_uri, mode="rb") as output_dataset:
         output.write_parquet(output_dataset)
 
     # returning the URI of the output for next steps to process it
-    return S3Object(s3=output_dataset_uri)
+    return S3Object(s3=output_dataset_filename)
 ```
 
 The example uses Polars. If you're more into SQL you can use DuckDB, but the code will have the same structure: initialization, reading from S3, transforming, writing back to S3.
````

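For readers skimming the diff, the structure called out in the context line above (initialization, reading from S3, transforming, writing back to S3) condenses with Polars 0.20 to the pattern below: reads go through `storage_options`, while writes still go through an `s3fs` file handle. This is a minimal sketch rather than the exact snippet from the docs; the resource path, the `value` column, and the aggregation are placeholders.

```python
import polars as pl
import s3fs
import wmill
from typing import TypedDict


class S3Object(TypedDict):
    # shape of the object reference passed between Windmill pipeline steps
    s3: str


def main(input_dataset: S3Object):
    # initialization: resolve the bucket and the Polars connection settings
    bucket = wmill.get_resource("/path/to/resource")["bucket"]  # placeholder resource path
    conn = wmill.polars_connection_settings()

    # reading from S3: Polars 0.20 reads straight from the URI via storage_options
    input_uri = "s3://{}/{}".format(bucket, input_dataset["s3"])
    df = pl.read_parquet(input_uri, storage_options=conn.storage_options)

    # transforming the data (placeholder aggregation)
    output = df.select(pl.col("value").sum().alias("total"))

    # writing back to S3: writes still go through an s3fs file handle
    s3 = s3fs.S3FileSystem(**conn.s3fs_args)
    output_file = "output.parquet"
    with s3.open("s3://{}/{}".format(bucket, output_file), mode="wb") as f:
        output.write_parquet(f)

    # return the object key so the next step can build its own URI
    return S3Object(s3=output_file)
```

Note the return value: after this commit the `s3` field carries the object key rather than a full `s3://` URI, which matches how the step above builds `input_dataset_uri` from the bucket and `input_dataset["s3"]`.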
docs/core_concepts/11_persistent_storage/index.mdx

Lines changed: 18 additions & 21 deletions
````diff
@@ -433,7 +433,7 @@ You can link a Windmill workspace to an S3 bucket and use it as source and/or ta
 
 ```python
 #requirements:
-#polars==0.19.19
+#polars==0.20.2
 #s3fs==2023.12.0
 #wmill>=1.229.0
 
@@ -444,33 +444,30 @@ import s3fs
 
 
 def main(input_file: S3Object):
-    bucket = wmill.get_resource("u/admin/windmill-cloud-demo")["bucket"]
+    bucket = wmill.get_resource("<PATH_TO_S3_RESOURCE>")["bucket"]
 
     # this will default to the workspace s3 resource
-    args = wmill.polars_connection_settings().s3fs_args
+    storage_options = wmill.polars_connection_settings().storage_options
     # this will use the designated resource
-    # args = wmill.polars_connection_settings("<PATH_TO_S3_RESOURCE>").s3fs_args
-    s3 = s3fs.S3FileSystem(**args)
+    # storage_options = wmill.polars_connection_settings("<PATH_TO_S3_RESOURCE>").storage_options
 
+    # input is a parquet file, we use read_parquet in lazy mode.
+    # Polars can read various file types, see
+    # https://pola-rs.github.io/polars/py-polars/html/reference/io.html
     input_uri = "s3://{}/{}".format(bucket, input_file["s3"])
-    output_file = "output/result.parquet"
-    output_uri = "s3://{}/{}".format(bucket, output_file)
+    input_df = pl.read_parquet(input_uri, storage_options=storage_options).lazy()
 
-    with (
-        s3.open(input_uri, mode="rb") as input_s3,
-        s3.open(output_uri, mode="wb") as output_s3,
-    ):
-        # input is a parquet file, we use read_parquet in lazy mode.
-        # Polars can read various file types, see
-        # https://pola-rs.github.io/polars/py-polars/html/reference/io.html
-        input_df = pl.read_parquet(input_s3).lazy()
-
-        # process the Polars dataframe. See Polars docs:
-        # for dataframe: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html
-        # for lazy dataframe: https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html
-        output_df = input_df.collect()
-        print(output_df)
+    # process the Polars dataframe. See Polars docs:
+    # for dataframe: https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html
+    # for lazy dataframe: https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html
+    output_df = input_df.collect()
+    print(output_df)
 
+    # To write back the result to S3, Polars needs an s3fs connection
+    s3 = s3fs.S3FileSystem(**wmill.polars_connection_settings().s3fs_args)
+    output_file = "output/result.parquet"
+    output_uri = "s3://{}/{}".format(bucket, output_file)
+    with s3.open(output_uri, mode="wb") as output_s3:
         # persist the output dataframe back to S3 and return it
         output_df.write_parquet(output_s3)
 
````

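A side note on the second snippet: `read_parquet(...).lazy()` still reads the file eagerly before wrapping it in a LazyFrame. Polars 0.20 also exposes `pl.scan_parquet` with a `storage_options` parameter, which defers reading until `.collect()`. The sketch below is not part of this commit and assumes the `storage_options` returned by `wmill.polars_connection_settings()` are accepted by Polars' native cloud reader, as the updated snippet already assumes for `read_parquet`.

```python
import polars as pl
import wmill


def main(input_file: dict):  # an S3Object, i.e. {"s3": "<key in the bucket>"}
    bucket = wmill.get_resource("<PATH_TO_S3_RESOURCE>")["bucket"]
    storage_options = wmill.polars_connection_settings().storage_options

    input_uri = "s3://{}/{}".format(bucket, input_file["s3"])

    # scan_parquet builds a LazyFrame without materializing the data up front;
    # rows are only fetched when .collect() runs
    lazy_df = pl.scan_parquet(input_uri, storage_options=storage_options)

    # e.g. peek at the first rows without loading the whole dataset
    print(lazy_df.head(10).collect())
```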