Skip to content

Commit edb6dde

Browse files
authored
chore: update S3 snippets (windmill-labs#449)
1 parent 9c3c54b commit edb6dde

File tree

2 files changed

+66
-43
lines changed
  • blog/2023-11-24-data-pipeline-orchestrator
  • docs/core_concepts/11_persistent_storage

2 files changed

+66
-43
lines changed

blog/2023-11-24-data-pipeline-orchestrator/index.mdx

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -158,16 +158,22 @@ And more to come! With both Windmill providing the boilerplate code, and Polars
158158
In the end, a canonical pipeline step in Windmill will look something like this:
159159

160160
```python
161+
#requirements:
162+
#polars==0.19.19
163+
#s3fs==2023.12.0
164+
#wmill>=1.229.0
165+
161166
import polars as pl
162167
import s3fs
163168
import datetime
164169
import wmill
165170

166-
s3object = dict
167-
def main(input_dataset: s3object):
171+
172+
def main(input_dataset: S3Object):
168173
# initialization: connect Polars to the workspace bucket
169174
s3_resource = wmill.get_resource("/path/to/resource")
170-
s3 = s3fs.S3FileSystem(wmill.polars_connection_settings("/path/to/resource")["s3fs_args"])
175+
s3fs_args = wmill.polars_connection_settings().s3fs_args
176+
s3 = s3fs.S3FileSystem(**s3fs_args)
171177

172178
# reading data from s3:
173179
bucket = s3_resource["bucket"]
@@ -194,9 +200,7 @@ def main(input_dataset: s3object):
194200
output.write_parquet(output_dataset)
195201

196202
# returning the URI of the output for next steps to process it
197-
return s3object({
198-
"s3": output_dataset_uri
199-
})
203+
return S3Object(s3=output_dataset_uri)
200204
```
201205

202206
The example uses Polars. If you're more into SQL you can use DuckDB, but the code will have the same structure: initialization, reading from S3, transforming, writing back to S3.

docs/core_concepts/11_persistent_storage/index.mdx

Lines changed: 56 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -333,18 +333,21 @@ Then from Windmill, just [fill the S3 resource type](../../integrations/s3.md).
333333
<TabItem value="deno" label="TypeScript (Deno)" attributes={{className: "text-xs p-4 !mt-0 !ml-0"}}>
334334

335335
```ts
336-
import * as wmill from 'npm:windmill-client@1';
336+
import type { S3Object } from 'npm:windmill-client@^1.229.0';
337+
import * as wmill from 'npm:windmill-client@^1.229.0';
337338
import { S3Client } from 'https://deno.land/x/s3_lite_client@0.2.0/mod.ts';
338339

339-
type s3object = object;
340+
export async function main(inputFile: S3Object) {
341+
// this will default to the workspace s3 resource
342+
let args = await wmill.denoS3LightClientSettings();
343+
// this will use the designated resource
344+
// let args = await wmill.denoS3LightClientSettings("<PATH_TO_S3_RESOURCE>");
345+
const s3Client = new S3Client(args);
340346

341-
export async function main(inputFile: s3object) {
342-
const s3Resource = await wmill.getResource('<PATH_TO_S3_RESOURCE>');
343-
const s3Client = new S3Client(s3Resource);
344347
const outputFile = 'output/hello.txt';
345348

346349
// read object from S3
347-
const getObjectResponse = await s3Client.getObject(inputFile['s3']);
350+
const getObjectResponse = await s3Client.getObject(inputFile.s3);
348351
const inputObjContent = await getObjectResponse.text();
349352
console.log(inputObjContent);
350353

@@ -356,31 +359,35 @@ export async function main(inputFile: s3object) {
356359
console.log(obj.key);
357360
}
358361

359-
return {
362+
const result: S3Object = {
360363
s3: outputFile
361364
};
365+
return result;
362366
}
363367
```
364368

365369
</TabItem>
366370
<TabItem value="python" label="Python" attributes={{className: "text-xs p-4 !mt-0 !ml-0"}}>
367371

368372
```python
373+
#requirements:
374+
#boto3==1.34.4
375+
#wmill>=1.229.0
376+
369377
import wmill
378+
from wmill import S3Object
370379
import boto3
371380

372-
s3object = dict
373381

382+
def main(input_file: S3Object):
383+
bucket = wmill.get_resource("<PATH_TO_S3_RESOURCE>")["bucket"]
384+
385+
# this will default to the workspace s3 resource
386+
args = wmill.boto3_connection_settings()
387+
# this will use the designated resource
388+
# args = wmill.boto3_connection_settings("<PATH_TO_S3_RESOURCE>")
389+
s3client = boto3.client("s3", **args)
374390

375-
def main(input_file: s3object):
376-
s3_resource = wmill.get_resource("<PATH_TO_S3_RESOURCE>")
377-
bucket = s3_resource["bucket"]
378-
s3client = boto3.client(
379-
"s3",
380-
region_name=s3_resource["region"],
381-
aws_access_key_id=s3_resource["accessKey"],
382-
aws_secret_access_key=s3_resource["secretKey"],
383-
)
384391
output_file = "output/hello.txt"
385392

386393
# read object from S3 and print its content
@@ -406,10 +413,9 @@ def main(input_file: s3object):
406413
# see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html
407414
# and https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html
408415
# for more code examples (listing object, deleting files, etc)
409-
410416
return [
411-
s3object({"s3": output_file}),
412-
s3object({"s3": uploaded_file}),
417+
S3Object(s3=output_file),
418+
S3Object(s3=uploaded_file),
413419
]
414420
```
415421

@@ -426,22 +432,26 @@ You can link a Windmill workspace to an S3 bucket and use it as source and/or ta
426432
<TabItem value="polars" label="Polars" attributes={{className: "text-xs p-4 !mt-0 !ml-0"}}>
427433

428434
```python
435+
#requirements:
436+
#polars==0.19.19
437+
#s3fs==2023.12.0
438+
#wmill>=1.229.0
439+
429440
import wmill
441+
from wmill import S3Object
430442
import polars as pl
431443
import s3fs
432444

433-
s3object = dict
434445

446+
def main(input_file: S3Object):
447+
bucket = wmill.get_resource("u/admin/windmill-cloud-demo")["bucket"]
435448

436-
def main(input_file: s3object):
437-
s3 = s3fs.S3FileSystem(
438-
# this will default to the workspace s3 resource
439-
**wmill.polars_connection_settings()["s3fs_args"]
440-
# this will use the designated resource
441-
# **wmill.polars_connection_settings("<PATH_TO_S3_RESOURCE>")["s3fs_args"]
442-
)
449+
# this will default to the workspace s3 resource
450+
args = wmill.polars_connection_settings().s3fs_args
451+
# this will use the designated resource
452+
# args = wmill.polars_connection_settings("<PATH_TO_S3_RESOURCE>").s3fs_args
453+
s3 = s3fs.S3FileSystem(**args)
443454

444-
bucket = "<S3_BUCKET_NAME>"
445455
input_uri = "s3://{}/{}".format(bucket, input_file["s3"])
446456
output_file = "output/result.parquet"
447457
output_uri = "s3://{}/{}".format(bucket, output_file)
@@ -463,29 +473,38 @@ def main(input_file: s3object):
463473

464474
# persist the output dataframe back to S3 and return it
465475
output_df.write_parquet(output_s3)
466-
return s3object({"s3": output_file})
476+
477+
return S3Object(s3=output_file)
467478
```
468479

469480
</TabItem>
470481
<TabItem value="duckdb" label="DuckDB" attributes={{className: "text-xs p-4 !mt-0 !ml-0"}}>
471482

472483
```python
484+
#requirements:
485+
#wmill>=1.229.0
486+
#duckdb==0.9.1
487+
473488
import wmill
489+
from wmill import S3Object
474490
import duckdb
475491

476-
s3object = dict
477492

493+
def main(input_file: S3Object):
494+
bucket = wmill.get_resource("u/admin/windmill-cloud-demo")["bucket"]
478495

479-
def main(input_file: s3object):
480496
# create a DuckDB database in memory
481497
# see https://duckdb.org/docs/api/python/dbapi
482498
conn = duckdb.connect()
483-
# connect duck db to the S3 bucket - this will default to the workspace s3 resource
484-
conn.execute(wmill.duckdb_connection_settings()["connection_settings_str"])
499+
500+
# this will default to the workspace s3 resource
501+
args = wmill.duckdb_connection_settings().connection_settings_str
485502
# this will use the designated resource
486-
# conn.execute(wmill.duckdb_connection_settings("<PATH_TO_S3_RESOURCE>")["connection_settings_str"])
503+
# args = wmill.duckdb_connection_settings("<PATH_TO_S3_RESOURCE>").connection_settings_str
504+
505+
# connect duck db to the S3 bucket - this will default to the workspace s3 resource
506+
conn.execute(args)
487507

488-
bucket = "<S3_BUCKET_NAME>"
489508
input_uri = "s3://{}/{}".format(bucket, input_file["s3"])
490509
output_file = "output/result.parquet"
491510
output_uri = "s3://{}/{}".format(bucket, output_file)
@@ -512,7 +531,7 @@ def main(input_file: s3object):
512531
)
513532

514533
conn.close()
515-
return s3object({"s3": output_file})
534+
return S3Object(s3=output_file)
516535
```
517536

518537
</TabItem>

0 commit comments

Comments
 (0)