Commit e0258d7
dataflow: add GPU sample (GoogleCloudPlatform#5067)
* Add Dataflow GPU example
* Minor clean ups
* Fix performance issue
* Allow custom base image
* Pull image from nvcr.io
* Use = for ENV for consistency
* use cudnn image
* symlink pip to /usr/local/bin/pip
* Install python with conda in nvidia/cuda
* Use multi stage build to optimize size
* Fix tensorflow certificates
* Use nvidia debian image and optimize pip installs
* Simplified Dockerfile
* Dockerfile cleanup
* Import cuda/compat through ldconfig
* Fix command line arguments
* Reordered instructions
* Update README
* Make lint pass
* Minor style fixes
* Fix type
* Add requirements-test.txt
* Use tensorflow-gpu image
* Update pip
* Remove type hints
* Add version note
* Simplified command line arguments
* Simplified comments
* Use only Python 3.6
* Add Python 3.6 installation instructions
* Update option to worker_machine_type
* Fix lint issue
* Not use cache in pip install
* Remove logging info
* Use noxfile_config
* Link to the docs
* Resolved review comments
* Minor style fixes
* Add e2e test
* Minor style changes
* Copy /opt/apache/beam
* Uncomment GPU experiment flags
* Reword pydocs
* Explain why ignoring Python versions
* Reword comments
* Make gpu_type a variable
* Make GPU check a fatal error
* Add flag to make GPUs required
* Fix missing type annotation
* Reworded transform labels
* Use us-east1 region
* Modify for testing
* Validate GPUs in parallel
* Change zones and try versions with docker
* Fix docker commands
* Remove -t from docker commands
* Configure docker
* Use us-central1 and update versions
* Style changes
* Add latest tag to image
* Use subprocess stdout
* Make fixtures scoped by session
* Enabled creation/deletion of resources
* Removed GroupByKey
* Shortened transform labels
* Changed GPU check order
* Ignore Python 3.9
* Add gpu_required pydocs
* Run e2e test with all files
* Style changes
* Make GPUs required by default
* Made GPU warning and error mutually exclusive
* Force delete bucket
* Don't delete files, the delete force takes care of it
* Set up an explicit temp_location

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
1 parent a9e8b53 · commit e0258d7

File tree

9 files changed: +640 −0 lines changed


dataflow/gpu-workers/.dockerignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Ignore files for docker.
env/
outputs/

dataflow/gpu-workers/.gcloudignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Ignore files for gcloud like Cloud Build.
env/
outputs/

dataflow/gpu-workers/Dockerfile

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM tensorflow/tensorflow:2.4.0-gpu

WORKDIR /root

# Installing the requirements here makes the workers start faster since they
# don't need to install the requirements at runtime.
# ℹ️ Make sure your requirements.txt includes `apache-beam[gcp]`.
COPY requirements.txt .

# Don't install `tensorflow` since the base image already comes with it.
RUN egrep -v '^tensorflow([=<>]|$)' requirements.txt > /tmp/requirements.txt \
    && pip install --no-cache-dir -U pip \
    && pip install --no-cache-dir -r /tmp/requirements.txt

# Copy the required Apache Beam files from the Beam Python SDK image.
COPY --from=apache/beam_python3.6_sdk:2.26.0 /opt/apache/beam /opt/apache/beam

# Set the entrypoint to the Apache Beam SDK worker launcher.
ENTRYPOINT [ "/opt/apache/beam/boot" ]
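
The `egrep -v` step above strips any `tensorflow` pin from `requirements.txt` so it cannot shadow the GPU build that ships with the base image. As a hypothetical illustration (this file is not part of the commit), a matching `requirements.txt` might look like:

```
# Hypothetical requirements.txt; versions shown match this sample's images.
apache-beam[gcp]==2.26.0
# A line like the following would be filtered out by the Dockerfile's egrep,
# since tensorflow/tensorflow:2.4.0-gpu already provides TensorFlow.
tensorflow==2.4.0
```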

dataflow/gpu-workers/README.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# Workers with GPUs

[![Open in Cloud Shell](http://gstatic.com/cloudssh/images/open-btn.svg)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataflow/gpu-workers/README.md)

📝 Tutorial: [Processing Landsat satellite images with GPUs](https://cloud.google.com/dataflow/docs/samples/satellite-images-gpus)
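
For context, the worker image can be built with Cloud Build, mirroring what the `image_name` fixture in `e2e_test.py` does below. A minimal sketch, assuming an authenticated gcloud CLI; the project ID and image path are placeholders:

```sh
# Placeholder names; the e2e test generates a unique image tag per run.
export PROJECT=my-project-id
gcloud builds submit \
  --project=$PROJECT \
  --tag=gcr.io/$PROJECT/dataflow/gpu-workers/sample:latest \
  --timeout=30m
```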

dataflow/gpu-workers/e2e_test.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
#!/usr/bin/env python

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import subprocess
import uuid

from google.cloud import storage
import pytest

SUFFIX = uuid.uuid4().hex[0:6]
PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
BUCKET_NAME = f"dataflow-gpu-test-{SUFFIX}"
IMAGE_NAME = f"gcr.io/{PROJECT}/dataflow/gpu-workers/test-{SUFFIX}:latest"


@pytest.fixture(scope="session")
def bucket_name() -> str:
    # Create a temporary bucket for the test outputs; delete it after the session.
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(BUCKET_NAME)

    yield BUCKET_NAME

    bucket.delete(force=True)


@pytest.fixture(scope="session")
def image_name() -> str:
    # Build the worker image with Cloud Build; delete it after the session.
    subprocess.run(
        [
            "gcloud",
            "builds",
            "submit",
            f"--project={PROJECT}",
            f"--tag={IMAGE_NAME}",
            "--timeout=30m",
            "--quiet",
        ],
        check=True,
    )

    yield IMAGE_NAME

    subprocess.run(
        [
            "gcloud",
            "container",
            "images",
            "delete",
            IMAGE_NAME,
            f"--project={PROJECT}",
            "--quiet",
        ],
        check=True,
    )


@pytest.fixture(scope="session")
def configure_docker() -> None:
    # Let the local docker client authenticate to Container Registry.
    subprocess.run(
        [
            "gcloud",
            "auth",
            "configure-docker",
        ],
        check=True,
    )


def test_python_version(image_name: str, configure_docker: None) -> None:
    # Make sure the local and Docker Python versions are the same.
    # If this test fails, the following needs updating:
    # - noxfile_config.py: The Python 'ignored_versions' should only allow
    #   the Dockerfile's Python version.
    # - Dockerfile: The `COPY --from=apache/beam` for the worker boot file.
    # - Docs tutorial: https://cloud.google.com/dataflow/docs/samples/satellite-images-gpus
    python_version = (
        subprocess.run(
            [
                "docker",
                "run",
                "--rm",
                "-i",
                "--entrypoint=bash",
                image_name,
                "-c",
                "python --version",
            ],
            stdout=subprocess.PIPE,
            check=True,
        )
        .stdout.decode("utf-8")
        .strip()
    )
    assert python_version == "Python 3.6.9"


def test_apache_beam_version(image_name: str, configure_docker: None) -> None:
    # Make sure the installed Apache Beam version matches the Apache Beam image
    # we use to copy the worker boot file.
    # If this test fails, the following needs updating:
    # - Dockerfile: The `COPY --from=apache/beam` for the worker boot file.
    apache_beam_version = (
        subprocess.run(
            [
                "docker",
                "run",
                "--rm",
                "-i",
                "--entrypoint=bash",
                image_name,
                "-c",
                "pip freeze | egrep '^apache-beam=='",
            ],
            stdout=subprocess.PIPE,
            check=True,
        )
        .stdout.decode("utf-8")
        .strip()
    )
    assert apache_beam_version == "apache-beam==2.26.0"


def test_tensorflow_version(image_name: str, configure_docker: None) -> None:
    # Make sure the installed TensorFlow version matches the TensorFlow version
    # in the Dockerfile.
    # If this test fails, the following needs updating:
    # - Dockerfile: The `FROM tensorflow/tensorflow` version.
    tensorflow_version = (
        subprocess.run(
            [
                "docker",
                "run",
                "--rm",
                "-i",
                "--entrypoint=bash",
                image_name,
                "-c",
                "pip freeze | egrep '^tensorflow(-gpu)?=='",
            ],
            stdout=subprocess.PIPE,
            check=True,
        )
        .stdout.decode("utf-8")
        .strip()
    )
    assert tensorflow_version == "tensorflow-gpu==2.4.0"


def test_end_to_end(bucket_name: str, image_name: str) -> None:
    # Run the Beam pipeline in Dataflow, making sure GPUs are used.
    gpu_type = "nvidia-tesla-t4"
    region = "us-central1"
    worker_zone = "us-central1-a"
    subprocess.run(
        [
            "python",
            "landsat_view.py",
            f"--output-path-prefix=gs://{bucket_name}/outputs/",
            "--runner=DataflowRunner",
            f"--project={PROJECT}",
            f"--region={region}",
            f"--temp_location=gs://{bucket_name}/temp",
            "--worker_machine_type=custom-1-13312-ext",
            f"--worker_harness_container_image={image_name}",
            f"--worker_zone={worker_zone}",
            f"--experiments=worker_accelerator=type={gpu_type},count=1,install-nvidia-driver",
            "--experiments=use_runner_v2",
        ],
        check=True,
    )

    # Check that output files were created and are not empty.
    storage_client = storage.Client()
    output_files = list(storage_client.list_blobs(bucket_name, prefix="outputs/"))
    assert len(output_files) > 0, "No output files found"
    for output_file in output_files:
        assert output_file.size > 0, f"Output file is empty: {output_file.name}"
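
These tests read `GOOGLE_CLOUD_PROJECT` from the environment and shell out to `gcloud` and `docker`, so they need an authenticated environment to run. A minimal sketch of a local invocation, with a placeholder project ID:

```sh
# Assumes gcloud is authenticated and docker is installed locally.
export GOOGLE_CLOUD_PROJECT=my-project-id
pytest e2e_test.py
```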
