YARN-8800. Updated documentation of Submarine with latest examples. Contributed by Wangda Tan.

Sunil G 2018-09-29 00:01:04 +05:30
parent 72891fc9be
commit 19ad5be651
30 changed files with 3235 additions and 12 deletions


@@ -180,6 +180,12 @@
<item name="System Services" href="hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html"/>
</menu>
<menu name="Submarine" inherit="top">
<item name="Index" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Index.html"/>
<item name="QuickStart" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/QuickStart.html"/>
<item name="Examples" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Examples.html"/>
</menu>
<menu name="Hadoop Compatible File Systems" inherit="top">
<item name="Aliyun OSS" href="hadoop-aliyun/tools/hadoop-aliyun/index.html"/>
<item name="Amazon S3" href="hadoop-aws/tools/hadoop-aws/index.html"/>


@@ -48,6 +48,8 @@ Goals of Submarine:
- Support launch tensorboard for training jobs if user specified.
- Support customized DNS name for roles (like tensorboard.$user.$domain:6006)
Please jump to [QuickStart](src/site/QuickStart.md) guide to quickly understand how to use this framework.
Please jump to [QuickStart](src/site/markdown/QuickStart.md) guide to quickly understand how to use this framework.
If you're a developer, please find [Developer](src/site/DeveloperGuide.md) guide for more details.
Please jump to [Examples](src/site/markdown/Examples.md) to try other examples like running Distributed Tensorflow Training for CIFAR 10.
If you're a developer, please find [Developer](src/site/markdown/DeveloperGuide.md) guide for more details.


@@ -0,0 +1,69 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM ubuntu:16.04
LABEL maintainer="Craig Citro <craigcitro@google.com>"
# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
python \
python-dev \
rsync \
software-properties-common \
unzip \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
RUN pip --no-cache-dir install \
Pillow \
h5py \
ipykernel \
jupyter \
matplotlib \
numpy \
pandas \
scipy \
sklearn \
&& \
python -m ipykernel.kernelspec
# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
# These lines will be edited automatically by parameterized_docker_build.sh. #
# COPY _PIP_FILE_ /
# RUN pip --no-cache-dir install /_PIP_FILE_
# RUN rm -f /_PIP_FILE_
# Install TensorFlow CPU version from central repo
RUN pip --no-cache-dir install \
http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
RUN apt-get update && apt-get install git -y
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
RUN tar zxf hadoop-3.1.1.tar.gz


@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
build-essential \
cuda-command-line-tools-9-0 \
cuda-cublas-9-0 \
cuda-cufft-9-0 \
cuda-curand-9-0 \
cuda-cusolver-9-0 \
cuda-cusparse-9-0 \
curl \
libcudnn7=7.0.5.15-1+cuda9.0 \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
python \
python-dev \
rsync \
software-properties-common \
unzip \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
RUN pip --no-cache-dir install \
Pillow \
h5py \
ipykernel \
jupyter \
matplotlib \
numpy \
pandas \
scipy \
sklearn \
&& \
python -m ipykernel.kernelspec
# Install TensorFlow GPU version.
RUN pip --no-cache-dir install \
http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
RUN apt-get update && apt-get install git -y
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
RUN tar zxf hadoop-3.1.0.tar.gz


@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Building base images"
set -e
cd base/ubuntu-16.04
docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu-base:0.0.1
docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu-base:0.0.1
echo "Finished building base images"
cd ../../with-cifar10-models/ubuntu-16.04
docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu:0.0.1
docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu:0.0.1


@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM tf-1.8.0-cpu-base:0.0.1
# Include models
RUN mkdir /test
ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
RUN chown -R nobody /test


@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM tf-1.8.0-gpu-base:0.0.1
# Include models
RUN mkdir /test
ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
RUN chown -R nobody /test


@@ -0,0 +1,542 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
(Copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator)
CIFAR-10 is a common benchmark in machine learning for image recognition.
http://www.cs.toronto.edu/~kriz/cifar.html
Code in this directory focuses on how to use TensorFlow Estimators to train and
evaluate a CIFAR-10 ResNet model on:
* A single host with one CPU;
* A single host with multiple GPUs;
* Multiple hosts with CPU or multiple GPUs;
Before trying to run the model, we highly encourage you to read this entire README.
## Prerequisite
1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.2.1 or
later.
2. Download the CIFAR-10 dataset and generate TFRecord files using the provided
script. The script and associated command below will download the CIFAR-10
dataset and then generate a TFRecord for the training, validation, and
evaluation datasets.
```shell
python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data
```
After running the command above, you should see the following files in the
--data-dir (```ls -R cifar-10-data```):
* train.tfrecords
* validation.tfrecords
* eval.tfrecords
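As an optional, hedged sanity check (not part of this repository), the record counts of the generated files can be verified with the TensorFlow 1.x `tf.python_io.tf_record_iterator` API; the expected sizes follow the split definitions in `cifar10.py` further below.
```python
import tensorflow as tf  # TensorFlow 1.x

# Count records in each generated split; expected: 45000 / 5000 / 10000.
for split in ['train', 'validation', 'eval']:
    path = 'cifar-10-data/%s.tfrecords' % split
    count = sum(1 for _ in tf.python_io.tf_record_iterator(path))
    print(split, count)
```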
## Training on a single machine with GPUs or CPU
Run the training on CPU only. After training, it runs the evaluation.
```
python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
--job-dir=/tmp/cifar10 \
--num-gpus=0 \
--train-steps=1000
```
Run the model on 2 GPUs using CPU as parameter server. After training, it runs
the evaluation.
```
python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
--job-dir=/tmp/cifar10 \
--num-gpus=2 \
--train-steps=1000
```
Run the model on 2 GPUs using GPU as parameter server.
It will run an experiment, which for a local setting basically means it will
stop training a couple of times to perform evaluation.
```
python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
--job-dir=/tmp/cifar10 \
--variable-strategy GPU \
--num-gpus=2
```
There are more command line flags to play with; run
`python cifar10_main.py --help` for details.
## Run distributed training
### (Optional) Running on Google Cloud Machine Learning Engine
This example can be run on Google Cloud Machine Learning Engine (ML Engine),
which will configure the environment and take care of running workers,
parameter servers, and masters in a fault-tolerant way.
To install the command line tool, and set up a project and billing, see the
quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line).
You'll also need a Google Cloud Storage bucket for the data. If you followed the
instructions above, you can just run:
```
MY_BUCKET=gs://<my-bucket-name>
gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/
```
Then run the following command from the `tutorials/image` directory of this
repository (the parent directory of this README):
```
gcloud ml-engine jobs submit training cifarmultigpu \
--runtime-version 1.2 \
--job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
--config cifar10_estimator/cmle_config.yaml \
--package-path cifar10_estimator/ \
--module-name cifar10_estimator.cifar10_main \
-- \
--data-dir=$MY_BUCKET/cifar-10-data \
--num-gpus=4 \
--train-steps=1000
```
### Set TF_CONFIG
Considering that you already have multiple hosts configured, all you need is a
`TF_CONFIG` environment variable on each host. You can set up the hosts manually
or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for
instructions about how to set up a Cluster.
The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and
their task: `master`, `ps` or `worker`.
Here's an example of `TF_CONFIG`.
```python
cluster = {'master': ['master-ip:8000'],
'ps': ['ps-ip:8000'],
'worker': ['worker-ip:8000']}
TF_CONFIG = json.dumps(
{'cluster': cluster,
'task': {'type': 'master', 'index': 0},
'model_dir': 'gs://<bucket_path>/<dir_path>',
'environment': 'cloud'
})
```
*Cluster*
A cluster spec is basically a dictionary that describes all of the tasks
in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed).
In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker;
a small sketch of how TensorFlow consumes it follows the list below.
* `ps`: saves the parameters among all workers. All workers can
read/write/update the parameters of the model via ps. As some models are
extremely large, the parameters are sharded among the ps (each ps stores a
subset).
* `worker`: does the training.
* `master`: basically a special worker; it does training, but also restores and
saves checkpoints and does evaluation.
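As a small, hedged sketch (not part of this repository), the `cluster` dictionary above maps directly onto a `tf.train.ClusterSpec`, which is what TensorFlow's `RunConfig` builds from the `cluster` section of `TF_CONFIG` (it shows up as `_cluster_spec` in the logs further below); the addresses are placeholders.
```python
import tensorflow as tf  # TensorFlow 1.x

# The same cluster dictionary used in the TF_CONFIG example above.
cluster = {'master': ['master-ip:8000'],
           'ps': ['ps-ip:8000'],
           'worker': ['worker-ip:8000']}

# ClusterSpec validates the job -> address mapping carried by TF_CONFIG.
spec = tf.train.ClusterSpec(cluster)
print(spec.jobs)                 # job names, e.g. ['master', 'ps', 'worker']
print(spec.job_tasks('worker'))  # ['worker-ip:8000']
```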
*Task*
The Task defines the role of the current node. In this example the node
is the master with index 0 in the cluster spec; the task will be different for
each node. An example of the `TF_CONFIG` for a worker would be:
```python
cluster = {'master': ['master-ip:8000'],
'ps': ['ps-ip:8000'],
'worker': ['worker-ip:8000']}
TF_CONFIG = json.dumps(
{'cluster': cluster,
'task': {'type': 'worker', 'index': 0},
'model_dir': 'gs://<bucket_path>/<dir_path>',
'environment': 'cloud'
})
```
*Model_dir*
This is the path where the master will save the checkpoints, graph and
TensorBoard files. For a multi-host environment you may want to use a
distributed file system; Google Storage and DFS are supported.
*Environment*
By default the environment is *local*; for a distributed setting we need to
change it to *cloud*.
### Running script
Once you have a `TF_CONFIG` configured properly on each host, you're ready to run
in a distributed setting. A minimal sketch of setting the variable from Python is shown below.
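This is a hedged illustration, not part of this repository: one way to export `TF_CONFIG` on a host is from Python before launching `cifar10_main.py`. The addresses, bucket path, and task type are placeholders and must be adjusted per node.
```python
import json
import os

cluster = {'master': ['master-ip:8000'],
           'ps': ['ps-ip:8000'],
           'worker': ['worker-ip:8000']}

# Set TF_CONFIG for this process (and any child process that launches
# cifar10_main.py); change 'type' to 'ps' or 'worker' on the corresponding
# nodes and keep model_dir identical everywhere.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster,
    'task': {'type': 'master', 'index': 0},
    'model_dir': 'gs://<bucket_path>/<dir_path>',
    'environment': 'cloud',
})
```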
#### Master
Run this on master:
Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for
40000 steps. It will run evaluation a couple of times during training. The
num_workers argument is used only to update the learning rate correctly. Make
sure the model_dir is the same as defined in the TF_CONFIG.
```shell
python cifar10_main.py --data-dir=gs://path/cifar-10-data \
--job-dir=gs://path/model_dir/ \
--num-gpus=4 \
--train-steps=40000 \
--sync \
--num-workers=2
```
*Output:*
```shell
INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd16fb2be10>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
gpu_options {
}
allow_soft_placement: true
, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
per_process_gpu_memory_fraction: 1.0
}
, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
...
2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:00:04.0
Total memory: 11.17GiB
Free memory: 11.09GiB
2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:00:05.0
Total memory: 11.17GiB
Free memory: 11.10GiB
...
2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0
2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config:
intra_op_parallelism_threads: 1
gpu_options {
per_process_gpu_memory_fraction: 1
}
allow_soft_placement: true
INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt.
INFO:tensorflow:loss = 1.20682, step = 1
INFO:tensorflow:loss = 1.20682, learning_rate = 0.1
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14
2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0)
2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0)
2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0)
2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0)
2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0)
2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0)
2017-08-01 20:00:15.745987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0)
2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0)
INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023
INFO:tensorflow:Evaluation [1/100]
INFO:tensorflow:Evaluation [2/100]
INFO:tensorflow:Evaluation [3/100]
INFO:tensorflow:Evaluation [4/100]
INFO:tensorflow:Evaluation [5/100]
INFO:tensorflow:Evaluation [6/100]
INFO:tensorflow:Evaluation [7/100]
INFO:tensorflow:Evaluation [8/100]
INFO:tensorflow:Evaluation [9/100]
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [11/100]
INFO:tensorflow:Evaluation [12/100]
INFO:tensorflow:Evaluation [13/100]
...
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31
INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425
```
#### Worker
Run this on worker:
Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for
40000 steps. It will run evaluation a couple of times during training. Make sure
the model_dir is the same as defined in the TF_CONFIG.
```shell
python cifar10_main.py --data-dir=gs://path/cifar-10-data \
--job-dir=gs://path/model_dir/ \
--num-gpus=4 \
--train-steps=40000 \
--sync
```
*Output:*
```shell
INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600,
'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker',
'_is_chief': False, '_cluster_spec':
<tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6918438e10>,
'_model_dir': 'gs://<path>/model_dir/',
'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000,
'_session_config': intra_op_parallelism_threads: 1
gpu_options {
}
allow_soft_placement: true
, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1,
'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
per_process_gpu_memory_fraction: 1.0
}
...
2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:00:04.0
Total memory: 11.17GiB
Free memory: 11.09GiB
2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:00:05.0
Total memory: 11.17GiB
Free memory: 11.10GiB
...
2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
INFO:tensorflow:Create CheckpointSaverHook.
2017-07-31 22:38:04.629150: I
tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting
for response from worker: /job:master/replica:0/task:0
2017-07-31 22:38:09.263492: I
tensorflow/core/distributed_runtime/master_session.cc:999] Start master
session cc58f93b1e259b0c with config:
intra_op_parallelism_threads: 1
gpu_options {
per_process_gpu_memory_fraction: 1
}
allow_soft_placement: true
INFO:tensorflow:loss = 5.82382, step = 0
INFO:tensorflow:loss = 5.82382, learning_rate = 0.8
INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10
INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20
INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30
INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40
INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50
INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60
INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70
INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec)
INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec)
INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80
INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90
INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100
INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110
INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120
INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130
INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140
INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150
INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160
INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170
INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec)
...
```
#### PS
Run this on ps:
The ps will not do training, so most of the arguments won't affect the execution.
```shell
python cifar10_main.py --job-dir=gs://path/model_dir/
```
*Output:*
```shell
INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f48f1addf90>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
gpu_options {
}
allow_soft_placement: true
, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
per_process_gpu_memory_fraction: 1.0
}
, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000}
2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000}
2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
```
## Visualizing results with TensorBoard
When using Estimators you can also visualize your data in TensorBoard, with no
changes in your code. You can use TensorBoard to visualize your TensorFlow
graph, plot quantitative metrics about the execution of your graph, and show
additional data like images that pass through it.
Point TensorBoard to the `job dir` parameter you used to train or evaluate your
model and you'll see the graphs and metrics it recorded.
Check TensorBoard during training or after it; just point TensorBoard to the
model_dir you chose in the previous step.
```shell
tensorboard --logdir="<job dir>"
```
## Warnings
When running `cifar10_main.py` with the `--sync` argument you may see an error
similar to:
```python
File "cifar10_main.py", line 538, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "cifar10_main.py", line 518, in main
hooks), run_config=config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run
return _execute_schedule(experiment, schedule)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule
return task()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate
hooks=self._eval_hooks)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate
hooks=hooks)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model
features, labels, model_fn_lib.ModeKeys.EVAL)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn
features=features, labels=labels, **kwargs)
File "cifar10_main.py", line 331, in _resnet_model_fn
gradvars, global_step=tf.train.get_global_step())
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in apply_gradients
variables.global_variables())
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped
return _add_should_use_warning(fn(*args, **kwargs))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning
wrapped = TFShouldUseWarningWrapper(x)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__
stack = [s.strip() for s in traceback.format_stack()]
```
This should not affect your training, and should be fixed in future releases.


@@ -0,0 +1,113 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CIFAR-10 data set.
See http://www.cs.toronto.edu/~kriz/cifar.html.
"""
import os
import tensorflow as tf
HEIGHT = 32
WIDTH = 32
DEPTH = 3
class Cifar10DataSet(object):
"""Cifar10 data set.
Described by http://www.cs.toronto.edu/~kriz/cifar.html.
"""
def __init__(self, data_dir, subset='train', use_distortion=True):
self.data_dir = data_dir
self.subset = subset
self.use_distortion = use_distortion
def get_filenames(self):
if self.subset in ['train', 'validation', 'eval']:
return [os.path.join(self.data_dir, self.subset + '.tfrecords')]
else:
raise ValueError('Invalid data subset "%s"' % self.subset)
def parser(self, serialized_example):
"""Parses a single tf.Example into image and label tensors."""
# Dimensions of the images in the CIFAR-10 dataset.
# See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
# input format.
features = tf.parse_single_example(
serialized_example,
features={
'image': tf.FixedLenFeature([], tf.string),
'label': tf.FixedLenFeature([], tf.int64),
})
image = tf.decode_raw(features['image'], tf.uint8)
image.set_shape([DEPTH * HEIGHT * WIDTH])
# Reshape from [depth * height * width] to [depth, height, width].
image = tf.cast(
tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
tf.float32)
label = tf.cast(features['label'], tf.int32)
# Custom preprocessing.
image = self.preprocess(image)
return image, label
def make_batch(self, batch_size):
"""Read the images and labels from 'filenames'."""
filenames = self.get_filenames()
# Repeat infinitely.
dataset = tf.data.TFRecordDataset(filenames).repeat()
# Parse records.
dataset = dataset.map(
self.parser)
# Potentially shuffle records.
if self.subset == 'train':
min_queue_examples = int(
Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4)
# Ensure that the capacity is sufficiently large to provide good random
# shuffling.
dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)
# Batch it up.
dataset = dataset.batch(batch_size)
iterator = dataset.make_one_shot_iterator()
image_batch, label_batch = iterator.get_next()
return image_batch, label_batch
def preprocess(self, image):
"""Preprocess a single image in [height, width, depth] layout."""
if self.subset == 'train' and self.use_distortion:
# Pad 4 pixels on each dimension of feature map, done in mini-batch
image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH])
image = tf.image.random_flip_left_right(image)
return image
@staticmethod
def num_examples_per_epoch(subset='train'):
if subset == 'train':
return 45000
elif subset == 'validation':
return 5000
elif subset == 'eval':
return 10000
else:
raise ValueError('Invalid data subset "%s"' % subset)


@@ -0,0 +1,521 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet model for classifying images from CIFAR-10 dataset.
Support single-host training with one or multiple devices.
ResNet as proposed in:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
CIFAR-10 as in:
http://www.cs.toronto.edu/~kriz/cifar.html
"""
from __future__ import division
from __future__ import print_function
import argparse
import functools
import itertools
import os
import cifar10
import cifar10_model
import cifar10_utils
import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.INFO)
def get_model_fn(num_gpus, variable_strategy, num_workers):
"""Returns a function that will build the resnet model."""
def _resnet_model_fn(features, labels, mode, params):
"""Resnet model body.
Support single host, one or more GPU training. Parameter distribution can
be one of the following schemes.
1. CPU is the parameter server and manages gradient updates.
2. Parameters are distributed evenly across all GPUs, and the first GPU
manages gradient updates.
Args:
features: a list of tensors, one for each tower
labels: a list of tensors, one for each tower
mode: ModeKeys.TRAIN or EVAL
params: Hyperparameters suitable for tuning
Returns:
A EstimatorSpec object.
"""
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
weight_decay = params.weight_decay
momentum = params.momentum
tower_features = features
tower_labels = labels
tower_losses = []
tower_gradvars = []
tower_preds = []
# channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
# on CPU. The exception is Intel MKL on CPU which is optimal with
# channels_last.
data_format = params.data_format
if not data_format:
if num_gpus == 0:
data_format = 'channels_last'
else:
data_format = 'channels_first'
if num_gpus == 0:
num_devices = 1
device_type = 'cpu'
else:
num_devices = num_gpus
device_type = 'gpu'
for i in range(num_devices):
worker_device = '/{}:{}'.format(device_type, i)
if variable_strategy == 'CPU':
device_setter = cifar10_utils.local_device_setter(
worker_device=worker_device)
elif variable_strategy == 'GPU':
device_setter = cifar10_utils.local_device_setter(
ps_device_type='gpu',
worker_device=worker_device,
ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
num_gpus, tf.contrib.training.byte_size_load_fn))
with tf.variable_scope('resnet', reuse=bool(i != 0)):
with tf.name_scope('tower_%d' % i) as name_scope:
with tf.device(device_setter):
loss, gradvars, preds = _tower_fn(
is_training, weight_decay, tower_features[i], tower_labels[i],
data_format, params.num_layers, params.batch_norm_decay,
params.batch_norm_epsilon)
tower_losses.append(loss)
tower_gradvars.append(gradvars)
tower_preds.append(preds)
if i == 0:
# Only trigger batch_norm moving mean and variance update from
# the 1st tower. Ideally, we should grab the updates from all
# towers but these stats accumulate extremely fast so we can
# ignore the other stats from the other towers without
# significant detriment.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
name_scope)
# Now compute global loss and gradients.
gradvars = []
with tf.name_scope('gradient_averaging'):
all_grads = {}
for grad, var in itertools.chain(*tower_gradvars):
if grad is not None:
all_grads.setdefault(var, []).append(grad)
for var, grads in six.iteritems(all_grads):
# Average gradients on the same device as the variables
# to which they apply.
with tf.device(var.device):
if len(grads) == 1:
avg_grad = grads[0]
else:
avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
gradvars.append((avg_grad, var))
# Device that runs the ops to apply global gradient updates.
consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
with tf.device(consolidation_device):
# Suggested learning rate scheduling from
# https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
'train') // (params.train_batch_size * num_workers)
boundaries = [
num_batches_per_epoch * x
for x in np.array([82, 123, 300], dtype=np.int64)
]
staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
boundaries, staged_lr)
loss = tf.reduce_mean(tower_losses, name='loss')
examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
params.train_batch_size, every_n_steps=10)
tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
logging_hook = tf.train.LoggingTensorHook(
tensors=tensors_to_log, every_n_iter=100)
train_hooks = [logging_hook, examples_sec_hook]
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate, momentum=momentum)
if params.sync:
optimizer = tf.train.SyncReplicasOptimizer(
optimizer, replicas_to_aggregate=num_workers)
sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief)
train_hooks.append(sync_replicas_hook)
# Create single grouped train op
train_op = [
optimizer.apply_gradients(
gradvars, global_step=tf.train.get_global_step())
]
train_op.extend(update_ops)
train_op = tf.group(*train_op)
predictions = {
'classes':
tf.concat([p['classes'] for p in tower_preds], axis=0),
'probabilities':
tf.concat([p['probabilities'] for p in tower_preds], axis=0)
}
stacked_labels = tf.concat(labels, axis=0)
metrics = {
'accuracy':
tf.metrics.accuracy(stacked_labels, predictions['classes'])
}
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train_op,
training_hooks=train_hooks,
eval_metric_ops=metrics)
return _resnet_model_fn
def _tower_fn(is_training, weight_decay, feature, label, data_format,
num_layers, batch_norm_decay, batch_norm_epsilon):
"""Build computation tower (Resnet).
Args:
is_training: true if is training graph.
weight_decay: weight regularization strength, a float.
feature: a Tensor.
label: a Tensor.
data_format: channels_last (NHWC) or channels_first (NCHW).
num_layers: number of layers, an int.
batch_norm_decay: decay for batch normalization, a float.
batch_norm_epsilon: epsilon for batch normalization, a float.
Returns:
A tuple with the loss for the tower, the gradients and parameters, and
predictions.
"""
model = cifar10_model.ResNetCifar10(
num_layers,
batch_norm_decay=batch_norm_decay,
batch_norm_epsilon=batch_norm_epsilon,
is_training=is_training,
data_format=data_format)
logits = model.forward_pass(feature, input_data_format='channels_last')
tower_pred = {
'classes': tf.argmax(input=logits, axis=1),
'probabilities': tf.nn.softmax(logits)
}
tower_loss = tf.losses.sparse_softmax_cross_entropy(
logits=logits, labels=label)
tower_loss = tf.reduce_mean(tower_loss)
model_params = tf.trainable_variables()
tower_loss += weight_decay * tf.add_n(
[tf.nn.l2_loss(v) for v in model_params])
tower_grad = tf.gradients(tower_loss, model_params)
return tower_loss, zip(tower_grad, model_params), tower_pred
def input_fn(data_dir,
subset,
num_shards,
batch_size,
use_distortion_for_training=True):
"""Create input graph for model.
Args:
data_dir: Directory where TFRecords representing the dataset are located.
subset: one of 'train', 'validation' and 'eval'.
num_shards: num of towers participating in data-parallel training.
batch_size: total batch size for training to be divided by the number of
shards.
use_distortion_for_training: True to use distortions.
Returns:
two lists of tensors for features and labels, each of num_shards length.
"""
with tf.device('/cpu:0'):
use_distortion = subset == 'train' and use_distortion_for_training
dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion)
image_batch, label_batch = dataset.make_batch(batch_size)
if num_shards <= 1:
# No GPU available or only 1 GPU.
return [image_batch], [label_batch]
# Note that passing num=batch_size is safe here, even though
# dataset.batch(batch_size) can, in some cases, return fewer than batch_size
# examples. This is because it does so only when repeating for a limited
# number of epochs, but our dataset repeats forever.
image_batch = tf.unstack(image_batch, num=batch_size, axis=0)
label_batch = tf.unstack(label_batch, num=batch_size, axis=0)
feature_shards = [[] for i in range(num_shards)]
label_shards = [[] for i in range(num_shards)]
for i in xrange(batch_size):
idx = i % num_shards
feature_shards[idx].append(image_batch[i])
label_shards[idx].append(label_batch[i])
feature_shards = [tf.parallel_stack(x) for x in feature_shards]
label_shards = [tf.parallel_stack(x) for x in label_shards]
return feature_shards, label_shards
def get_experiment_fn(data_dir,
num_gpus,
variable_strategy,
use_distortion_for_training=True):
"""Returns an Experiment function.
Experiments perform training on several workers in parallel,
in other words experiments know how to invoke train and eval in a sensible
fashion for distributed training. Arguments passed directly to this
function are not tunable, all other arguments should be passed within
tf.HParams, passed to the enclosed function.
Args:
data_dir: str. Location of the data for input_fns.
num_gpus: int. Number of GPUs on each worker.
variable_strategy: String. CPU to use CPU as the parameter server
and GPU to use the GPUs as the parameter server.
use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
Returns:
A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
tf.contrib.learn.Experiment.
Suitable for use by tf.contrib.learn.learn_runner, which will run various
methods on Experiment (train, evaluate) based on information
about the current runner in `run_config`.
"""
def _experiment_fn(run_config, hparams):
"""Returns an Experiment."""
# Create estimator.
train_input_fn = functools.partial(
input_fn,
data_dir,
subset='train',
num_shards=num_gpus,
batch_size=hparams.train_batch_size,
use_distortion_for_training=use_distortion_for_training)
eval_input_fn = functools.partial(
input_fn,
data_dir,
subset='eval',
batch_size=hparams.eval_batch_size,
num_shards=num_gpus)
num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
if num_eval_examples % hparams.eval_batch_size != 0:
raise ValueError(
'validation set size must be multiple of eval_batch_size')
train_steps = hparams.train_steps
eval_steps = num_eval_examples // hparams.eval_batch_size
classifier = tf.estimator.Estimator(
model_fn=get_model_fn(num_gpus, variable_strategy,
run_config.num_worker_replicas or 1),
config=run_config,
params=hparams)
# Create experiment.
return tf.contrib.learn.Experiment(
classifier,
train_input_fn=train_input_fn,
eval_input_fn=eval_input_fn,
train_steps=train_steps,
eval_steps=eval_steps)
return _experiment_fn
def main(job_dir, data_dir, num_gpus, variable_strategy,
use_distortion_for_training, log_device_placement, num_intra_threads,
**hparams):
# The env variable is on deprecation path, default is set to off.
os.environ['TF_SYNC_ON_FINISH'] = '0'
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
# Session configuration.
sess_config = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=log_device_placement,
intra_op_parallelism_threads=num_intra_threads,
gpu_options=tf.GPUOptions(force_gpu_compatible=True))
config = cifar10_utils.RunConfig(
session_config=sess_config, model_dir=job_dir)
tf.contrib.learn.learn_runner.run(
get_experiment_fn(data_dir, num_gpus, variable_strategy,
use_distortion_for_training),
run_config=config,
hparams=tf.contrib.training.HParams(
is_chief=config.is_chief,
**hparams))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--data-dir',
type=str,
required=True,
help='The directory where the CIFAR-10 input data is stored.')
parser.add_argument(
'--job-dir',
type=str,
required=True,
help='The directory where the model will be stored.')
parser.add_argument(
'--variable-strategy',
choices=['CPU', 'GPU'],
type=str,
default='CPU',
help='Where to locate variable operations')
parser.add_argument(
'--num-gpus',
type=int,
default=1,
help='The number of gpus used. Uses only CPU if set to 0.')
parser.add_argument(
'--num-layers',
type=int,
default=44,
help='The number of layers of the model.')
parser.add_argument(
'--train-steps',
type=int,
default=80000,
help='The number of steps to use for training.')
parser.add_argument(
'--train-batch-size',
type=int,
default=128,
help='Batch size for training.')
parser.add_argument(
'--eval-batch-size',
type=int,
default=100,
help='Batch size for validation.')
parser.add_argument(
'--momentum',
type=float,
default=0.9,
help='Momentum for MomentumOptimizer.')
parser.add_argument(
'--weight-decay',
type=float,
default=2e-4,
help='Weight decay for convolutions.')
parser.add_argument(
'--learning-rate',
type=float,
default=0.1,
help="""\
This is the inital learning rate value. The learning rate will decrease
during training. For more details check the model_fn implementation in
this file.\
""")
parser.add_argument(
'--use-distortion-for-training',
type=bool,
default=True,
help='If doing image distortion for training.')
parser.add_argument(
'--sync',
action='store_true',
default=False,
help="""\
If present when running in a distributed environment, will run in sync mode.\
""")
parser.add_argument(
'--num-intra-threads',
type=int,
default=0,
help="""\
Number of threads to use for intra-op parallelism. When training on CPU
set to 0 to have the system pick the appropriate number or alternatively
set it to the number of physical CPU cores.\
""")
parser.add_argument(
'--num-inter-threads',
type=int,
default=0,
help="""\
Number of threads to use for inter-op parallelism. If set to 0, the
system will pick an appropriate number.\
""")
parser.add_argument(
'--data-format',
type=str,
default=None,
help="""\
If not set, the data format best for the training device is used.
Allowed values: channels_first (NCHW) channels_last (NHWC).\
""")
parser.add_argument(
'--log-device-placement',
action='store_true',
default=False,
help='Whether to log device placement.')
parser.add_argument(
'--batch-norm-decay',
type=float,
default=0.997,
help='Decay for batch norm.')
parser.add_argument(
'--batch-norm-epsilon',
type=float,
default=1e-5,
help='Epsilon for batch norm.')
args = parser.parse_args()
if args.num_gpus > 0:
assert tf.test.is_gpu_available(), "Requested GPUs but none found."
if args.num_gpus < 0:
raise ValueError(
'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.')
if args.num_gpus == 0 and args.variable_strategy == 'GPU':
raise ValueError('num-gpus=0, CPU must be used as parameter server. Set'
'--variable-strategy=CPU.')
if (args.num_layers - 2) % 6 != 0:
raise ValueError('Invalid --num-layers parameter.')
if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
raise ValueError('--train-batch-size must be multiple of --num-gpus.')
if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
raise ValueError('--eval-batch-size must be multiple of --num-gpus.')
main(**vars(args))

View File

@@ -0,0 +1,80 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model class for Cifar10 Dataset."""
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import model_base
class ResNetCifar10(model_base.ResNet):
"""Cifar10 model with ResNetV1 and basic residual block."""
def __init__(self,
num_layers,
is_training,
batch_norm_decay,
batch_norm_epsilon,
data_format='channels_first'):
super(ResNetCifar10, self).__init__(
is_training,
data_format,
batch_norm_decay,
batch_norm_epsilon
)
self.n = (num_layers - 2) // 6
# Add one in case label starts with 1. No impact if label starts with 0.
self.num_classes = 10 + 1
self.filters = [16, 16, 32, 64]
self.strides = [1, 2, 2]
def forward_pass(self, x, input_data_format='channels_last'):
"""Build the core model within the graph."""
if self._data_format != input_data_format:
if input_data_format == 'channels_last':
# Computation requires channels_first.
x = tf.transpose(x, [0, 3, 1, 2])
else:
# Computation requires channels_last.
x = tf.transpose(x, [0, 2, 3, 1])
# Image standardization.
x = x / 128 - 1
x = self._conv(x, 3, 16, 1)
x = self._batch_norm(x)
x = self._relu(x)
# Use basic (non-bottleneck) block and ResNet V1 (post-activation).
res_func = self._residual_v1
# 3 stages of block stacking.
for i in range(3):
with tf.name_scope('stage'):
for j in range(self.n):
if j == 0:
# First block in a stage, filters and strides may change.
x = res_func(x, 3, self.filters[i], self.filters[i + 1],
self.strides[i])
else:
# Following blocks in a stage, constant filters and unit stride.
x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1)
x = self._global_avg_pool(x)
x = self._fully_connected(x, self.num_classes)
return x

View File

@ -0,0 +1,154 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import collections
import six
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from tensorflow.core.framework import node_def_pb2
from tensorflow.python.framework import device as pydev
from tensorflow.python.training import basic_session_run_hooks
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
from tensorflow.python.training import device_setter
from tensorflow.contrib.learn.python.learn import run_config
# TODO(b/64848083) Remove once uid bug is fixed
class RunConfig(tf.contrib.learn.RunConfig):
def uid(self, whitelist=None):
"""Generates a 'Unique Identifier' based on all internal fields.
Caller should use the uid string to check `RunConfig` instance integrity
in one session use, but should not rely on the implementation details, which
are subject to change.
Args:
whitelist: A list of the string names of the properties uid should not
include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
includes most properties the user is allowed to change.
Returns:
A uid string.
"""
if whitelist is None:
whitelist = run_config._DEFAULT_UID_WHITE_LIST
state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
# Pop out the keys in whitelist.
for k in whitelist:
state.pop('_' + k, None)
ordered_state = collections.OrderedDict(
sorted(state.items(), key=lambda t: t[0]))
# For class instances without __repr__, special care is required.
# Otherwise, the object address will be used.
if '_cluster_spec' in ordered_state:
ordered_state['_cluster_spec'] = collections.OrderedDict(
sorted(ordered_state['_cluster_spec'].as_dict().items(),
key=lambda t: t[0])
)
return ', '.join(
'%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state))
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
"""Hook to print out examples per second.
Total time is tracked and then divided by the total number of steps
to get the average step time and then batch_size is used to determine
the running average of examples per second. The examples per second for the
most recent interval is also logged.
"""
def __init__(
self,
batch_size,
every_n_steps=100,
every_n_secs=None,):
"""Initializer for ExamplesPerSecondHook.
Args:
batch_size: Total batch size used to calculate examples/second from
global time.
every_n_steps: Log stats every n steps.
every_n_secs: Log stats every n seconds.
"""
if (every_n_steps is None) == (every_n_secs is None):
raise ValueError('exactly one of every_n_steps'
' and every_n_secs should be provided.')
self._timer = basic_session_run_hooks.SecondOrStepTimer(
every_steps=every_n_steps, every_secs=every_n_secs)
self._step_train_time = 0
self._total_steps = 0
self._batch_size = batch_size
def begin(self):
self._global_step_tensor = training_util.get_global_step()
if self._global_step_tensor is None:
raise RuntimeError(
'Global step should be created to use StepCounterHook.')
def before_run(self, run_context): # pylint: disable=unused-argument
return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
def after_run(self, run_context, run_values):
_ = run_context
global_step = run_values.results
if self._timer.should_trigger_for_step(global_step):
elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
global_step)
if elapsed_time is not None:
steps_per_sec = elapsed_steps / elapsed_time
self._step_train_time += elapsed_time
self._total_steps += elapsed_steps
average_examples_per_sec = self._batch_size * (
self._total_steps / self._step_train_time)
current_examples_per_sec = steps_per_sec * self._batch_size
# Average examples/sec followed by current examples/sec
logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
average_examples_per_sec, current_examples_per_sec,
self._total_steps)
def local_device_setter(num_devices=1,
ps_device_type='cpu',
worker_device='/cpu:0',
ps_ops=None,
ps_strategy=None):
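"""Returns a device chooser that places variable ops (ps_ops) on
'/{ps_device_type}:N' devices selected by ps_strategy (round-robin by
default) and all other ops on worker_device."""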
if ps_ops is None:
ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
if ps_strategy is None:
ps_strategy = device_setter._RoundRobinStrategy(num_devices)
if not six.callable(ps_strategy):
raise TypeError("ps_strategy must be callable")
def _local_device_chooser(op):
current_device = pydev.DeviceSpec.from_string(op.device or "")
node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
if node_def.op in ps_ops:
ps_device_spec = pydev.DeviceSpec.from_string(
'/{}:{}'.format(ps_device_type, ps_strategy(op)))
ps_device_spec.merge_from(current_device)
return ps_device_spec.to_string()
else:
worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
worker_device_spec.merge_from(current_device)
return worker_device_spec.to_string()
return _local_device_chooser

View File

@ -0,0 +1,114 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords.
Generates tf.train.Example protos and writes them to TFRecord files from the
python version of the CIFAR-10 dataset downloaded from
https://www.cs.toronto.edu/~kriz/cifar.html.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import tarfile
from six.moves import cPickle as pickle
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
CIFAR_FILENAME = 'cifar-10-python.tar.gz'
CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME
CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py'
def download_and_extract(data_dir):
# download CIFAR-10 if not already downloaded.
tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir,
CIFAR_DOWNLOAD_URL)
tarfile.open(os.path.join(data_dir, CIFAR_FILENAME),
'r:gz').extractall(data_dir)
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _get_file_names():
"""Returns the file names expected to exist in the input_dir."""
file_names = {}
file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)]
file_names['validation'] = ['data_batch_5']
file_names['eval'] = ['test_batch']
return file_names
def read_pickle_from_file(filename):
with tf.gfile.Open(filename, 'rb') as f:
data_dict = pickle.load(f)
return data_dict
def convert_to_tfrecord(input_files, output_file):
"""Converts a file to TFRecords."""
print('Generating %s' % output_file)
with tf.python_io.TFRecordWriter(output_file) as record_writer:
for input_file in input_files:
data_dict = read_pickle_from_file(input_file)
data = data_dict['data']
labels = data_dict['labels']
num_entries_in_batch = len(labels)
for i in range(num_entries_in_batch):
example = tf.train.Example(features=tf.train.Features(
feature={
'image': _bytes_feature(data[i].tobytes()),
'label': _int64_feature(labels[i])
}))
record_writer.write(example.SerializeToString())
def main(data_dir):
print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL))
download_and_extract(data_dir)
file_names = _get_file_names()
input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER)
for mode, files in file_names.items():
input_files = [os.path.join(input_dir, f) for f in files]
output_file = os.path.join(data_dir, mode + '.tfrecords')
try:
os.remove(output_file)
except OSError:
pass
# Convert to tf.train.Example and write them to TFRecords.
convert_to_tfrecord(input_files, output_file)
print('Done!')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--data-dir',
type=str,
default='',
help='Directory to download and extract CIFAR-10 to.')
args = parser.parse_args()
main(args.data_dir)

View File

@ -0,0 +1,219 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet model.
Related papers:
https://arxiv.org/pdf/1603.05027v2.pdf
https://arxiv.org/pdf/1512.03385v1.pdf
https://arxiv.org/pdf/1605.07146v1.pdf
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class ResNet(object):
"""ResNet model."""
def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon):
"""ResNet constructor.
Args:
is_training: if build training or inference model.
data_format: the data_format used during computation.
one of 'channels_first' or 'channels_last'.
"""
self._batch_norm_decay = batch_norm_decay
self._batch_norm_epsilon = batch_norm_epsilon
self._is_training = is_training
assert data_format in ('channels_first', 'channels_last')
self._data_format = data_format
def forward_pass(self, x):
raise NotImplementedError(
'forward_pass() is implemented in ResNet sub classes')
def _residual_v1(self,
x,
kernel_size,
in_filter,
out_filter,
stride,
activate_before_residual=False):
"""Residual unit with 2 sub layers, using Plan A for shortcut connection."""
del activate_before_residual
with tf.name_scope('residual_v1') as name_scope:
orig_x = x
x = self._conv(x, kernel_size, out_filter, stride)
x = self._batch_norm(x)
x = self._relu(x)
x = self._conv(x, kernel_size, out_filter, 1)
x = self._batch_norm(x)
if in_filter != out_filter:
orig_x = self._avg_pool(orig_x, stride, stride)
pad = (out_filter - in_filter) // 2
if self._data_format == 'channels_first':
orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
else:
orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
x = self._relu(tf.add(x, orig_x))
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
return x
def _residual_v2(self,
x,
in_filter,
out_filter,
stride,
activate_before_residual=False):
"""Residual unit with 2 sub layers with preactivation, plan A shortcut."""
with tf.name_scope('residual_v2') as name_scope:
if activate_before_residual:
x = self._batch_norm(x)
x = self._relu(x)
orig_x = x
else:
orig_x = x
x = self._batch_norm(x)
x = self._relu(x)
x = self._conv(x, 3, out_filter, stride)
x = self._batch_norm(x)
x = self._relu(x)
x = self._conv(x, 3, out_filter, [1, 1, 1, 1])
if in_filter != out_filter:
pad = (out_filter - in_filter) // 2
orig_x = self._avg_pool(orig_x, stride, stride)
if self._data_format == 'channels_first':
orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
else:
orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
x = tf.add(x, orig_x)
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
return x
def _bottleneck_residual_v2(self,
x,
in_filter,
out_filter,
stride,
activate_before_residual=False):
"""Bottleneck residual unit with 3 sub layers, plan B shortcut."""
with tf.name_scope('bottle_residual_v2') as name_scope:
if activate_before_residual:
x = self._batch_norm(x)
x = self._relu(x)
orig_x = x
else:
orig_x = x
x = self._batch_norm(x)
x = self._relu(x)
x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True)
x = self._batch_norm(x)
x = self._relu(x)
# pad when stride isn't unit
x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True)
x = self._batch_norm(x)
x = self._relu(x)
x = self._conv(x, 1, out_filter, 1, is_atrous=True)
if in_filter != out_filter:
orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True)
x = tf.add(x, orig_x)
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
return x
def _conv(self, x, kernel_size, filters, strides, is_atrous=False):
"""Convolution."""
padding = 'SAME'
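# For strided (non-atrous) convolutions, pad the input explicitly by a total
# of kernel_size - 1 and switch to VALID padding, so the padding amount does
# not depend on the input size.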
if not is_atrous and strides > 1:
pad = kernel_size - 1
pad_beg = pad // 2
pad_end = pad - pad_beg
if self._data_format == 'channels_first':
x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
else:
x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
padding = 'VALID'
return tf.layers.conv2d(
inputs=x,
kernel_size=kernel_size,
filters=filters,
strides=strides,
padding=padding,
use_bias=False,
data_format=self._data_format)
def _batch_norm(self, x):
if self._data_format == 'channels_first':
data_format = 'NCHW'
else:
data_format = 'NHWC'
return tf.contrib.layers.batch_norm(
x,
decay=self._batch_norm_decay,
center=True,
scale=True,
epsilon=self._batch_norm_epsilon,
is_training=self._is_training,
fused=True,
data_format=data_format)
def _relu(self, x):
return tf.nn.relu(x)
def _fully_connected(self, x, out_dim):
with tf.name_scope('fully_connected') as name_scope:
x = tf.layers.dense(x, out_dim)
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
return x
def _avg_pool(self, x, pool_size, stride):
with tf.name_scope('avg_pool') as name_scope:
x = tf.layers.average_pooling2d(
x, pool_size, stride, 'SAME', data_format=self._data_format)
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
return x
def _global_avg_pool(self, x):
with tf.name_scope('global_avg_pool') as name_scope:
assert x.get_shape().ndims == 4
if self._data_format == 'channels_first':
x = tf.reduce_mean(x, [2, 3])
else:
x = tf.reduce_mean(x, [1, 2])
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
return x

View File

@ -0,0 +1,75 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvidia/cuda:9.0-base-ubuntu16.04
RUN echo "$LOG_TAG update and install basic packages" && \
apt-get -y update && apt-get install -y --no-install-recommends \
build-essential \
curl \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
rsync \
software-properties-common \
unzip \
vim \
wget \
&& \
apt-get install -y locales && \
locale-gen $LANG && \
apt-get clean && \
apt -y autoclean && \
apt -y dist-upgrade && \
apt-get install -y build-essential && \
rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
RUN echo "$LOG_TAG Install java8" && \
apt-get -y update && \
apt-get install -y openjdk-8-jdk && \
rm -rf /var/lib/apt/lists/*
# Install Zeppelin
ENV Z_VERSION="0.7.3" \
Z_HOME="/zeppelin"
RUN echo "$LOG_TAG Download Zeppelin binary" && \
wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
mv /zeppelin-${Z_VERSION}-bin-all ${Z_HOME}
ENV PATH="${Z_HOME}/bin:${PATH}"
RUN echo "$LOG_TAG Set locale" && \
echo "LC_ALL=en_US.UTF-8" >> /etc/environment && \
echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen && \
echo "LANG=en_US.UTF-8" > /etc/locale.conf && \
locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
LC_ALL=en_US.UTF-8
COPY zeppelin-site.xml $Z_HOME/conf/zeppelin-site.xml
COPY shiro.ini ${Z_HOME}/conf/shiro.ini
RUN chmod 777 -R ${Z_HOME}
COPY run_container.sh /usr/local/bin/run_container.sh
RUN chmod 755 /usr/local/bin/run_container.sh
EXPOSE 8080
CMD ["/usr/local/bin/run_container.sh"]

View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"${Z_HOME}/bin/zeppelin-daemon.sh" start
while true; do
#perform the test
sleep 5
done

View File

@ -0,0 +1,120 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
[users]
# List of users with their password allowed to access Zeppelin.
# To use a different strategy (LDAP / Database / ...) check the shiro doc at http://shiro.apache.org/configuration.html#Configuration-INISections
# To enable admin user, uncomment the following line and set an appropriate password.
admin = admin, admin
user1 = password2, role1, role2
user2 = password3, role3
user3 = password4, role2
# Sample LDAP configuration, for user Authentication, currently tested for single Realm
[main]
### A sample for configuring Active Directory Realm
#activeDirectoryRealm = org.apache.zeppelin.realm.ActiveDirectoryGroupRealm
#activeDirectoryRealm.systemUsername = userNameA
#use either systemPassword or hadoopSecurityCredentialPath, more details in http://zeppelin.apache.org/docs/latest/security/shiroauthentication.html
#activeDirectoryRealm.systemPassword = passwordA
#activeDirectoryRealm.hadoopSecurityCredentialPath = jceks://file/user/zeppelin/zeppelin.jceks
#activeDirectoryRealm.searchBase = CN=Users,DC=SOME_GROUP,DC=COMPANY,DC=COM
#activeDirectoryRealm.url = ldap://ldap.test.com:389
#activeDirectoryRealm.groupRolesMap = "CN=admin,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"admin","CN=finance,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"finance","CN=hr,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"hr"
#activeDirectoryRealm.authorizationCachingEnabled = false
### A sample for configuring LDAP Directory Realm
#ldapRealm = org.apache.zeppelin.realm.LdapGroupRealm
## search base for ldap groups (only relevant for LdapGroupRealm):
#ldapRealm.contextFactory.environment[ldap.searchBase] = dc=COMPANY,dc=COM
#ldapRealm.contextFactory.url = ldap://ldap.test.com:389
#ldapRealm.userDnTemplate = uid={0},ou=Users,dc=COMPANY,dc=COM
#ldapRealm.contextFactory.authenticationMechanism = simple
### A sample PAM configuration
#pamRealm=org.apache.zeppelin.realm.PamRealm
#pamRealm.service=sshd
### A sample for configuring ZeppelinHub Realm
#zeppelinHubRealm = org.apache.zeppelin.realm.ZeppelinHubRealm
## Url of ZeppelinHub
#zeppelinHubRealm.zeppelinhubUrl = https://www.zeppelinhub.com
#securityManager.realms = $zeppelinHubRealm
## A sample for configuring Knox SSO Realm
#knoxJwtRealm = org.apache.zeppelin.realm.jwt.KnoxJwtRealm
#knoxJwtRealm.providerUrl = https://domain.example.com/
#knoxJwtRealm.login = gateway/knoxsso/knoxauth/login.html
#knoxJwtRealm.logout = gateway/knoxssout/api/v1/webssout
#knoxJwtRealm.logoutAPI = true
#knoxJwtRealm.redirectParam = originalUrl
#knoxJwtRealm.cookieName = hadoop-jwt
#knoxJwtRealm.publicKeyPath = /etc/zeppelin/conf/knox-sso.pem
#
#knoxJwtRealm.groupPrincipalMapping = group.principal.mapping
#knoxJwtRealm.principalMapping = principal.mapping
#authc = org.apache.zeppelin.realm.jwt.KnoxAuthenticationFilter
sessionManager = org.apache.shiro.web.session.mgt.DefaultWebSessionManager
### If caching of user is required then uncomment below lines
#cacheManager = org.apache.shiro.cache.MemoryConstrainedCacheManager
#securityManager.cacheManager = $cacheManager
### Enables 'HttpOnly' flag in Zeppelin cookies
cookie = org.apache.shiro.web.servlet.SimpleCookie
cookie.name = JSESSIONID
cookie.httpOnly = true
### Uncomment the below line only when Zeppelin is running over HTTPS
#cookie.secure = true
sessionManager.sessionIdCookie = $cookie
securityManager.sessionManager = $sessionManager
# 86,400,000 milliseconds = 24 hour
securityManager.sessionManager.globalSessionTimeout = 86400000
shiro.loginUrl = /api/login
[roles]
role1 = *
role2 = *
role3 = *
admin = *
[urls]
# This section is used for url-based security. For details see the shiro.ini documentation.
#
# You can secure interpreter, configuration and credential information by urls.
# Comment or uncomment the below urls that you want to hide:
# anon means the access is anonymous.
# authc means form based auth Security.
#
# IMPORTANT: Order matters: URL path expressions are evaluated against an incoming request
# in the order they are defined and the FIRST MATCH WINS.
#
# To allow anonymous access to all but the stated urls,
# uncomment the second-to-last line (/** = anon) and comment the last line (/** = authc)
#
/api/version = anon
# Allow all authenticated users to restart interpreters on a notebook page.
# Comment out the following line if you would like to authorize only admin users to restart interpreters.
/api/interpreter/setting/restart/** = authc
/api/interpreter/** = authc, roles[admin]
/api/configurations/** = authc, roles[admin]
/api/credential/** = authc, roles[admin]
#/** = anon
/** = authc

View File

@ -0,0 +1,569 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>zeppelin.server.addr</name>
<value>0.0.0.0</value>
<description>Server address</description>
</property>
<property>
<name>zeppelin.server.port</name>
<value>8080</value>
<description>Server port.</description>
</property>
<property>
<name>zeppelin.server.ssl.port</name>
<value>8443</value>
<description>Server ssl port. (used when ssl property is set to true)</description>
</property>
<property>
<name>zeppelin.server.context.path</name>
<value>/</value>
<description>Context Path of the Web Application</description>
</property>
<property>
<name>zeppelin.war.tempdir</name>
<value>webapps</value>
<description>Location of jetty temporary directory</description>
</property>
<property>
<name>zeppelin.notebook.dir</name>
<value>notebook</value>
<description>path or URI for notebook persist</description>
</property>
<property>
<name>zeppelin.notebook.homescreen</name>
<value></value>
<description>id of notebook to be displayed in homescreen. ex) 2A94M5J1Z Empty value displays default home screen</description>
</property>
<property>
<name>zeppelin.notebook.homescreen.hide</name>
<value>false</value>
<description>hide homescreen notebook from list when this value set to true</description>
</property>
<property>
<name>zeppelin.notebook.collaborative.mode.enable</name>
<value>true</value>
<description>Enable collaborative mode</description>
</property>
<!-- Google Cloud Storage notebook storage -->
<!--
<property>
<name>zeppelin.notebook.gcs.dir</name>
<value></value>
<description>
A GCS path in the form gs://bucketname/path/to/dir.
Notes are stored at {zeppelin.notebook.gcs.dir}/{notebook-id}/note.json
</description>
</property>
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.GCSNotebookRepo</value>
<description>notebook persistence layer implementation</description>
</property>
-->
<!-- Amazon S3 notebook storage -->
<!-- Creates the following directory structure: s3://{bucket}/{username}/{notebook-id}/note.json -->
<!--
<property>
<name>zeppelin.notebook.s3.user</name>
<value>user</value>
<description>user name for s3 folder structure</description>
</property>
<property>
<name>zeppelin.notebook.s3.bucket</name>
<value>zeppelin</value>
<description>bucket name for notebook storage</description>
</property>
<property>
<name>zeppelin.notebook.s3.endpoint</name>
<value>s3.amazonaws.com</value>
<description>endpoint for s3 bucket</description>
</property>
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.S3NotebookRepo</value>
<description>notebook persistence layer implementation</description>
</property>
-->
<!-- Additionally, encryption is supported for notebook data stored in S3 -->
<!-- Use the AWS KMS to encrypt data -->
<!-- If used, the EC2 role assigned to the EMR cluster must have rights to use the given key -->
<!-- See https://aws.amazon.com/kms/ and http://docs.aws.amazon.com/kms/latest/developerguide/concepts.html -->
<!--
<property>
<name>zeppelin.notebook.s3.kmsKeyID</name>
<value>AWS-KMS-Key-UUID</value>
<description>AWS KMS key ID used to encrypt notebook data in S3</description>
</property>
-->
<!-- provide region of your KMS key -->
<!-- See http://docs.aws.amazon.com/general/latest/gr/rande.html#kms_region for region codes names -->
<!--
<property>
<name>zeppelin.notebook.s3.kmsKeyRegion</name>
<value>us-east-1</value>
<description>AWS KMS key region in your AWS account</description>
</property>
-->
<!-- Use a custom encryption materials provider to encrypt data -->
<!-- No configuration is given to the provider, so you must use system properties or another means to configure -->
<!-- See https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/EncryptionMaterialsProvider.html -->
<!--
<property>
<name>zeppelin.notebook.s3.encryptionMaterialsProvider</name>
<value>provider implementation class name</value>
<description>Custom encryption materials provider used to encrypt notebook data in S3</description>
</property>
-->
<!-- Server-side encryption enabled for notebooks -->
<!--
<property>
<name>zeppelin.notebook.s3.sse</name>
<value>true</value>
<description>Server-side encryption enabled for notebooks</description>
</property>
-->
<!-- Optional override to control which signature algorithm should be used to sign AWS requests -->
<!-- Set this property to "S3SignerType" if your AWS S3 compatible APIs support only AWS Signature Version 2 such as Ceph. -->
<!--
<property>
<name>zeppelin.notebook.s3.signerOverride</name>
<value>S3SignerType</value>
<description>optional override to control which signature algorithm should be used to sign AWS requests</description>
</property>
-->
<!-- If using Azure for storage use the following settings -->
<!--
<property>
<name>zeppelin.notebook.azure.connectionString</name>
<value>DefaultEndpointsProtocol=https;AccountName=<accountName>;AccountKey=<accountKey></value>
<description>Azure account credentials</description>
</property>
<property>
<name>zeppelin.notebook.azure.share</name>
<value>zeppelin</value>
<description>share name for notebook storage</description>
</property>
<property>
<name>zeppelin.notebook.azure.user</name>
<value>user</value>
<description>optional user name for Azure folder structure</description>
</property>
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.AzureNotebookRepo</value>
<description>notebook persistence layer implementation</description>
</property>
-->
<!-- Notebook storage layer using local file system
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.VFSNotebookRepo</value>
<description>local notebook persistence layer implementation</description>
</property>
-->
<!-- Notebook storage layer using hadoop compatible file system
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.FileSystemNotebookRepo</value>
<description>Hadoop compatible file system notebook persistence layer implementation, such as local file system, hdfs, azure wasb, s3 and etc.</description>
</property>
<property>
<name>zeppelin.server.kerberos.keytab</name>
<value></value>
<description>keytab for accessing kerberized hdfs</description>
</property>
<property>
<name>zeppelin.server.kerberos.principal</name>
<value></value>
<description>principal for accessing kerberized hdfs</description>
</property>
-->
<!-- For connecting your Zeppelin with ZeppelinHub -->
<!--
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.GitNotebookRepo, org.apache.zeppelin.notebook.repo.zeppelinhub.ZeppelinHubRepo</value>
<description>two notebook persistence layers (versioned local + ZeppelinHub)</description>
</property>
-->
<!-- MongoDB notebook storage -->
<!--
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.MongoNotebookRepo</value>
<description>notebook persistence layer implementation</description>
</property>
<property>
<name>zeppelin.notebook.mongo.uri</name>
<value>mongodb://localhost</value>
<description>MongoDB connection URI used to connect to a MongoDB database server</description>
</property>
<property>
<name>zeppelin.notebook.mongo.database</name>
<value>zeppelin</value>
<description>database name for notebook storage</description>
</property>
<property>
<name>zeppelin.notebook.mongo.collection</name>
<value>notes</value>
<description>collection name for notebook storage</description>
</property>
<property>
<name>zeppelin.notebook.mongo.autoimport</name>
<value>false</value>
<description>import local notes into MongoDB automatically on startup</description>
</property>
-->
<property>
<name>zeppelin.notebook.storage</name>
<value>org.apache.zeppelin.notebook.repo.GitNotebookRepo</value>
<description>versioned notebook persistence layer implementation</description>
</property>
<property>
<name>zeppelin.notebook.one.way.sync</name>
<value>false</value>
<description>If there are multiple notebook storages, should we treat the first one as the only source of truth?</description>
</property>
<property>
<name>zeppelin.interpreter.dir</name>
<value>interpreter</value>
<description>Interpreter implementation base directory</description>
</property>
<property>
<name>zeppelin.interpreter.localRepo</name>
<value>local-repo</value>
<description>Local repository for interpreter's additional dependency loading</description>
</property>
<property>
<name>zeppelin.interpreter.dep.mvnRepo</name>
<value>http://repo1.maven.org/maven2/</value>
<description>Remote principal repository for interpreter's additional dependency loading</description>
</property>
<property>
<name>zeppelin.dep.localrepo</name>
<value>local-repo</value>
<description>Local repository for dependency loader</description>
</property>
<property>
<name>zeppelin.helium.node.installer.url</name>
<value>https://nodejs.org/dist/</value>
<description>Remote Node installer url for Helium dependency loader</description>
</property>
<property>
<name>zeppelin.helium.npm.installer.url</name>
<value>http://registry.npmjs.org/</value>
<description>Remote Npm installer url for Helium dependency loader</description>
</property>
<property>
<name>zeppelin.helium.yarnpkg.installer.url</name>
<value>https://github.com/yarnpkg/yarn/releases/download/</value>
<description>Remote Yarn package installer url for Helium dependency loader</description>
</property>
<property>
<name>zeppelin.interpreters</name>
<value>org.apache.zeppelin.spark.SparkInterpreter,org.apache.zeppelin.spark.PySparkInterpreter,org.apache.zeppelin.rinterpreter.RRepl,org.apache.zeppelin.rinterpreter.KnitR,org.apache.zeppelin.spark.SparkRInterpreter,org.apache.zeppelin.spark.SparkSqlInterpreter,org.apache.zeppelin.spark.DepInterpreter,org.apache.zeppelin.markdown.Markdown,org.apache.zeppelin.angular.AngularInterpreter,org.apache.zeppelin.shell.ShellInterpreter,org.apache.zeppelin.file.HDFSFileInterpreter,org.apache.zeppelin.flink.FlinkInterpreter,,org.apache.zeppelin.python.PythonInterpreter,org.apache.zeppelin.python.PythonInterpreterPandasSql,org.apache.zeppelin.python.PythonCondaInterpreter,org.apache.zeppelin.python.PythonDockerInterpreter,org.apache.zeppelin.lens.LensInterpreter,org.apache.zeppelin.ignite.IgniteInterpreter,org.apache.zeppelin.ignite.IgniteSqlInterpreter,org.apache.zeppelin.cassandra.CassandraInterpreter,org.apache.zeppelin.geode.GeodeOqlInterpreter,org.apache.zeppelin.jdbc.JDBCInterpreter,org.apache.zeppelin.kylin.KylinInterpreter,org.apache.zeppelin.elasticsearch.ElasticsearchInterpreter,org.apache.zeppelin.scalding.ScaldingInterpreter,org.apache.zeppelin.alluxio.AlluxioInterpreter,org.apache.zeppelin.hbase.HbaseInterpreter,org.apache.zeppelin.livy.LivySparkInterpreter,org.apache.zeppelin.livy.LivyPySparkInterpreter,org.apache.zeppelin.livy.LivyPySpark3Interpreter,org.apache.zeppelin.livy.LivySparkRInterpreter,org.apache.zeppelin.livy.LivySparkSQLInterpreter,org.apache.zeppelin.bigquery.BigQueryInterpreter,org.apache.zeppelin.beam.BeamInterpreter,org.apache.zeppelin.pig.PigInterpreter,org.apache.zeppelin.pig.PigQueryInterpreter,org.apache.zeppelin.scio.ScioInterpreter,org.apache.zeppelin.groovy.GroovyInterpreter</value>
<description>Comma separated interpreter configurations. The first interpreter becomes the default</description>
</property>
<property>
<name>zeppelin.interpreter.group.order</name>
<value>spark,md,angular,sh,livy,alluxio,file,psql,flink,python,ignite,lens,cassandra,geode,kylin,elasticsearch,scalding,jdbc,hbase,bigquery,beam,groovy</value>
<description></description>
</property>
<property>
<name>zeppelin.interpreter.connect.timeout</name>
<value>30000</value>
<description>Interpreter process connect timeout in msec.</description>
</property>
<property>
<name>zeppelin.interpreter.output.limit</name>
<value>102400</value>
<description>Output message from interpreter exceeding the limit will be truncated</description>
</property>
<property>
<name>zeppelin.ssl</name>
<value>false</value>
<description>Should SSL be used by the servers?</description>
</property>
<property>
<name>zeppelin.ssl.client.auth</name>
<value>false</value>
<description>Should client authentication be used for SSL connections?</description>
</property>
<property>
<name>zeppelin.ssl.keystore.path</name>
<value>keystore</value>
<description>Path to keystore relative to Zeppelin configuration directory</description>
</property>
<property>
<name>zeppelin.ssl.keystore.type</name>
<value>JKS</value>
<description>The format of the given keystore (e.g. JKS or PKCS12)</description>
</property>
<property>
<name>zeppelin.ssl.keystore.password</name>
<value>change me</value>
<description>Keystore password. Can be obfuscated by the Jetty Password tool</description>
</property>
<!--
<property>
<name>zeppelin.ssl.key.manager.password</name>
<value>change me</value>
<description>Key Manager password. Defaults to keystore password. Can be obfuscated.</description>
</property>
-->
<property>
<name>zeppelin.ssl.truststore.path</name>
<value>truststore</value>
<description>Path to truststore relative to Zeppelin configuration directory. Defaults to the keystore path</description>
</property>
<property>
<name>zeppelin.ssl.truststore.type</name>
<value>JKS</value>
<description>The format of the given truststore (e.g. JKS or PKCS12). Defaults to the same type as the keystore type</description>
</property>
<!--
<property>
<name>zeppelin.ssl.truststore.password</name>
<value>change me</value>
<description>Truststore password. Can be obfuscated by the Jetty Password tool. Defaults to the keystore password</description>
</property>
-->
<property>
<name>zeppelin.server.allowed.origins</name>
<value>*</value>
<description>Allowed sources for REST and WebSocket requests (i.e. http://onehost:8080,http://otherhost.com). If you leave * you are vulnerable to https://issues.apache.org/jira/browse/ZEPPELIN-173</description>
</property>
<property>
<name>zeppelin.anonymous.allowed</name>
<value>false</value>
<description>Anonymous user allowed by default</description>
</property>
<property>
<name>zeppelin.username.force.lowercase</name>
<value>false</value>
<description>Force convert username case to lower case, useful for Active Directory/LDAP. Default is not to change case</description>
</property>
<property>
<name>zeppelin.notebook.default.owner.username</name>
<value></value>
<description>Set owner role by default</description>
</property>
<property>
<name>zeppelin.notebook.public</name>
<value>true</value>
<description>Make notebook public by default when created, private otherwise</description>
</property>
<property>
<name>zeppelin.websocket.max.text.message.size</name>
<value>1024000</value>
<description>Size in characters of the maximum text message to be received by websocket. Defaults to 1024000</description>
</property>
<property>
<name>zeppelin.server.default.dir.allowed</name>
<value>false</value>
<description>Enable directory listings on server.</description>
</property>
<!--
<property>
<name>zeppelin.interpreter.lifecyclemanager.class</name>
<value>org.apache.zeppelin.interpreter.lifecycle.TimeoutLifecycleManager</value>
<description>LifecycleManager class for managing the lifecycle of interpreters, by default interpreter will
be closed after timeout</description>
</property>
<property>
<name>zeppelin.interpreter.lifecyclemanager.timeout.checkinterval</name>
<value>60000</value>
<description>Milliseconds of the interval to checking whether interpreter is time out</description>
</property>
<property>
<name>zeppelin.interpreter.lifecyclemanager.timeout.threshold</name>
<value>3600000</value>
<description>Milliseconds of the interpreter timeout threshold, by default it is 1 hour</description>
</property>
-->
<!--
<property>
<name>zeppelin.server.jetty.name</name>
<value>Jetty(7.6.0.v20120127)</value>
<description>Hardcoding Application Server name to Prevent Fingerprinting</description>
</property>
-->
<!--
<property>
<name>zeppelin.server.jetty.request.header.size</name>
<value>8192</value>
<description>Http Request Header Size Limit (to prevent HTTP 413)</description>
</property>
-->
<!--
<property>
<name>zeppelin.server.xframe.options</name>
<value>SAMEORIGIN</value>
<description>The X-Frame-Options HTTP response header can be used to indicate whether or not a browser should be allowed to render a page in a frame/iframe/object.</description>
</property>
-->
<!--
<property>
<name>zeppelin.server.strict.transport</name>
<value>max-age=631138519</value>
<description>The HTTP Strict-Transport-Security response header is a security feature that lets a web site tell browsers that it should only be communicated with using HTTPS, instead of using HTTP. Enable this when Zeppelin is running on HTTPS. Value is in Seconds, the default value is equivalent to 20 years.</description>
</property>
-->
<!--
<property>
<name>zeppelin.server.xxss.protection</name>
<value>1</value>
<description>The HTTP X-XSS-Protection response header is a feature of Internet Explorer, Chrome and Safari that stops pages from loading when they detect reflected cross-site scripting (XSS) attacks. When value is set to 1 and a cross-site scripting attack is detected, the browser will sanitize the page (remove the unsafe parts).</description>
</property>
-->
<!--
<property>
<name>zeppelin.interpreter.callback.portRange</name>
<value>10000:10010</value>
</property>
-->
<!--
<property>
<name>zeppelin.recovery.storage.class</name>
<value>org.apache.zeppelin.interpreter.recovery.FileSystemRecoveryStorage</value>
<description>RecoveryStorage implementation</description>
</property>
-->
<!--
<property>
<name>zeppelin.recovery.dir</name>
<value>recovery</value>
<description>Location where recovery metadata is stored</description>
</property>
-->
<!-- GitHub configurations
<property>
<name>zeppelin.notebook.git.remote.url</name>
<value></value>
<description>remote Git repository URL</description>
</property>
<property>
<name>zeppelin.notebook.git.remote.username</name>
<value>token</value>
<description>remote Git repository username</description>
</property>
<property>
<name>zeppelin.notebook.git.remote.access-token</name>
<value></value>
<description>remote Git repository password</description>
</property>
<property>
<name>zeppelin.notebook.git.remote.origin</name>
<value>origin</value>
<description>Git repository remote</description>
</property>
<property>
<name>zeppelin.notebook.cron.enable</name>
<value>false</value>
<description>Notebook enable cron scheduler feature</description>
</property>
<property>
<name>zeppelin.notebook.cron.folders</name>
<value></value>
<description>Notebook cron folders</description>
</property>
-->
</configuration>

View File

@ -12,9 +12,7 @@
limitations under the License. See accompanying LICENSE file.
-->
# Developer Guide
By default, Submarine uses the YARN service framework as its runtime. If you want to add your own implementation, you can add a new `RuntimeFactory` implementation and configure the following option in `submarine.xml` (which should be placed under the same `$HADOOP_CONF_DIR`)

View File

@ -0,0 +1,21 @@
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
# Examples
Here are some examples of Submarine usage.
[Running Distributed CIFAR 10 Tensorflow Job](RunningDistributedCifar10TFJobs.html)
[Running Zeppelin Notebook on YARN](RunningZeppelinOnYARN.html)

View File

@ -0,0 +1,42 @@
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
Submarine is a project which allows infra engineers / data scientists to run *unmodified* Tensorflow programs on YARN.
Goals of Submarine:
- It allows jobs easy access to data/models in HDFS and other storages.
- Can launch services to serve Tensorflow/MXNet models.
- Supports running distributed Tensorflow jobs with simple configs.
- Supports running user-specified Docker images.
- Supports specifying GPU and other resources.
- Supports launching Tensorboard for training jobs if the user specifies it.
- Supports customized DNS names for roles (like tensorboard.$user.$domain:6006)
Click the contents below if you want to understand more.
- [QuickStart Guide](QuickStart.html)
- [Examples](Examples.html)
- [How to write Dockerfile for Submarine jobs](WriteDockerfile.html)
- [Developer guide](DeveloperGuide.html)

View File

@ -17,11 +17,13 @@
## Prerequisite
Must:
- Apache Hadoop 3.1.x, YARN service enabled.
Optional:
- Enable YARN DNS. (When distributed training is required.)
- Enable GPU on YARN support. (When GPU-based training is required.)
## Run jobs
@ -81,15 +83,26 @@ yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job run \
--input_path hdfs://default/dataset/cifar-10-data \
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
--worker_resources memory=4G,vcores=2,gpu=2 \
--worker_launch_cmd "python ... (Your training application cmd)"
--worker_launch_cmd "python ... (Your training application cmd)" \
--tensorboard # this will launch a companion tensorboard container for monitoring
```
#### Notes:
1) `DOCKER_JAVA_HOME` points to JAVA_HOME inside the Docker image.
2) `DOCKER_HADOOP_HDFS_HOME` points to HADOOP_HDFS_HOME inside the Docker image.
3) `--worker_resources` can include gpu when you need GPUs to train your task.
4) When `--tensorboard` is specified, you can go to the new YARN UI, go to Services -> `<your specified service>` -> click `...` to access Tensorboard.
This launches a Tensorboard that monitors *all your jobs*. In the new YARN UI, go to the Services page, open `tensorboard-service`, and the quick link (`Tensorboard`) will lead you to the Tensorboard.
See the screenshot below:
![alt text](./images/tensorboard-service.png "Tensorboard service")
### Launch Distributed Tensorflow Application:
#### Commandline
@ -110,12 +123,14 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
#### Notes:
1) Very similar to a standalone TF application, but you need to specify the number of workers / parameter servers (#worker/#ps).
2) Different resources can be specified for worker and PS.
3) The `TF_CONFIG` environment variable will be auto-generated and set before executing the user's launch command; see the sketch below.
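A minimal sketch (assuming the standard Tensorflow `TF_CONFIG` layout with `cluster` and `task` entries; names here are illustrative) of how a launched training script could inspect this variable:
```
import json
import os

# Assumed layout (standard Tensorflow convention), e.g.:
# {"cluster": {"master": [...], "worker": [...], "ps": [...]},
#  "task": {"type": "worker", "index": 0}}
tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
cluster = tf_config.get("cluster", {})
task = tf_config.get("task", {})
print("role=%s index=%s cluster=%s"
      % (task.get("type"), task.get("index"), cluster))
```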
## Get job history / logs
### Get Job Status from CLI
```
yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001
@ -131,4 +146,29 @@ Job Meta Info:
(... all your commandline before run the job)
```
After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Tensorboard of the job.
### Run tensorboard to monitor your jobs
```
# Cleanup previous service if needed
yarn app -destroy tensorboard-service; \
yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
job run --name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
--num_workers 0 --tensorboard
```
You can view the training history of multiple jobs from the `Tensorboard` link:
![alt text](./images/multiple-tensorboard-jobs.png "Tensorboard for multiple jobs")
### Get component logs from a training job
There are two ways to get training job logs. One is from the YARN UI (new or old):
![alt text](./images/job-logs-ui.png "Job logs UI")
Or you can use `yarn logs -applicationId <applicationId>` to get logs from the CLI.

View File

@ -0,0 +1,162 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Tutorial: Running Distributed Cifar10 Tensorflow Estimator Example.
## Prepare data for training
CIFAR-10 is a common benchmark in machine learning for image recognition. The example below is based on the CIFAR-10 dataset.
1) Check out https://github.com/tensorflow/models/:
```
git clone https://github.com/tensorflow/models/
```
2) Go to `models/tutorials/image/cifar10_estimator`
3) Generate data by using the following command (requires Tensorflow to be installed):
```
python generate_cifar10_tfrecords.py --data-dir=cifar-10-data
```
4) Upload data to HDFS
```
hadoop fs -put cifar-10-data/ /dataset/cifar-10-data
```
**Please note that:**
YARN service doesn't allow multiple services with the same name, so please run the following command
```
yarn application -destroy <service-name>
```
to delete services if you want to reuse the same service name.
## Prepare Docker images
Refer to [Write Dockerfile](WriteDockerfile.md) to build a Docker image, or use a prebuilt one.
## Run Tensorflow jobs
### Run standalone training
```
yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \
--input_path hdfs://default/dataset/cifar-10-data \
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
--num_workers 1 --worker_resources memory=8G,vcores=2,gpu=1 \
--worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \
--tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3
```
Explanations:
- When access to HDFS is required, the two environment variables are required to indicate JAVA_HOME and HDFS_HOME, so that the libhdfs libraries *inside the Docker image* can be used. We will try to eliminate the need to specify these in the future.
- The Docker images for the worker and Tensorboard can be specified separately. In this case, Tensorboard doesn't need a GPU, so we use a CPU Docker image for Tensorboard. (The same applies to the parameter server in the distributed example below.)
### Run distributed training
```
yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \
--input_path hdfs://default/dataset/cifar-10-data \
--env(s) (same as standalone)
--num_workers 2 \
--worker_resources memory=8G,vcores=2,gpu=1 \
--worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \
--ps_docker_image wtan/tf-1.8.0-cpu:0.0.3 \
--num_ps 1 --ps_resources memory=4G,vcores=2,gpu=0 \
--ps_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \
--tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3
```
Explanations:
- A `num_workers` value greater than 1 indicates distributed training.
- The parameters / resources / Docker image of the parameter server can be specified separately. In many cases, the parameter server doesn't require a GPU.
*Outputs of distributed training*
Sample output of master:
```
...
allow_soft_placement: true
, '_tf_random_seed': None, '_task_type': u'master', '_environment': u'cloud', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe77cb15050>, '_tf_config': gpu_options {
per_process_gpu_memory_fraction: 1.0
}
...
2018-05-06 22:29:14.656022: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> localhost:8000}
2018-05-06 22:29:14.656097: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:29:14.656112: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:29:14.659359: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
...
INFO:tensorflow:Restoring parameters from hdfs://default/tmp/cifar-10-jobdir/model.ckpt-0
INFO:tensorflow:Evaluation [1/625]
INFO:tensorflow:Evaluation [2/625]
INFO:tensorflow:Evaluation [3/625]
INFO:tensorflow:Evaluation [4/625]
INFO:tensorflow:Evaluation [5/625]
INFO:tensorflow:Evaluation [6/625]
...
INFO:tensorflow:Validation (step 1): loss = 1220.6445, global_step = 1, accuracy = 0.1
INFO:tensorflow:loss = 6.3980675, step = 0
INFO:tensorflow:loss = 6.3980675, learning_rate = 0.1
INFO:tensorflow:global_step/sec: 2.34092
INFO:tensorflow:Average examples/sec: 1931.22 (1931.22), step = 100
INFO:tensorflow:Average examples/sec: 354.236 (38.6479), step = 110
INFO:tensorflow:Average examples/sec: 211.096 (38.7693), step = 120
INFO:tensorflow:Average examples/sec: 156.533 (38.1633), step = 130
INFO:tensorflow:Average examples/sec: 128.6 (38.7372), step = 140
INFO:tensorflow:Average examples/sec: 111.533 (39.0239), step = 150
```
Sample output of worker:
```
, '_tf_random_seed': None, '_task_type': u'worker', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc2a490b050>, '_tf_config': gpu_options {
per_process_gpu_memory_fraction: 1.0
}
...
2018-05-06 22:28:45.807936: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:45.808040: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:45.808064: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> localhost:8000}
2018-05-06 22:28:45.809919: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
...
INFO:tensorflow:loss = 5.319096, step = 0
INFO:tensorflow:loss = 5.319096, learning_rate = 0.1
INFO:tensorflow:Average examples/sec: 49.2338 (49.2338), step = 10
INFO:tensorflow:Average examples/sec: 52.117 (55.3589), step = 20
INFO:tensorflow:Average examples/sec: 53.2754 (55.7541), step = 30
INFO:tensorflow:Average examples/sec: 53.8388 (55.6028), step = 40
INFO:tensorflow:Average examples/sec: 54.1082 (55.2134), step = 50
INFO:tensorflow:Average examples/sec: 54.3141 (55.3676), step = 60
```
Sample output of ps:
```
...
, '_tf_random_seed': None, '_task_type': u'ps', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4be54dff90>, '_tf_config': gpu_options {
per_process_gpu_memory_fraction: 1.0
}
...
2018-05-06 22:28:42.562316: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:42.562408: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
2018-05-06 22:28:42.562433: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000}
2018-05-06 22:28:42.564242: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
```

View File

@@ -0,0 +1,37 @@
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
# Running Zeppelin Notebook On Submarine
This is a simple example of how to run a Zeppelin notebook using Submarine.
## Step 1: Build Docker Image
Go to `src/main/docker/zeppelin-notebook-example` and build the Docker image, or use the prebuilt one: `hadoopsubmarine/zeppelin-on-yarn-gpu:0.0.1`.
## Step 2: Launch the notebook on YARN
Submit the following command to YARN:
```
yarn app -destroy zeppelin-notebook;
yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
  job run --name zeppelin-notebook \
  --docker_image hadoopsubmarine/zeppelin-on-yarn-gpu:0.0.1 \
  --worker_resources memory=8G,vcores=2,gpu=1 \
  --num_workers 1 \
  --worker_launch_cmd "/usr/local/bin/run_container.sh"
```
Once the container is launched, go to the `YARN services` UI page, access the `zeppelin-notebook` job, and open the `notebook` quicklink by clicking `...`.
The notebook is secured with the user name and password admin/admin.
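If the quicklink does not show up right away, you can first check the service state from the command line. A minimal sketch, assuming the job was submitted with the name `zeppelin-notebook` as above:
```
# Print the YARN service status (state, components, quicklinks) for the job.
yarn app -status zeppelin-notebook
```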

View File

@@ -0,0 +1,117 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Creating Docker Images for Running Tensorflow on YARN
## How to create docker images to run Tensorflow on YARN
A Dockerfile to run Tensorflow on YARN needs two parts:
**Base libraries which Tensorflow depends on**
1) OS base image, for example ```ubuntu:16.04```
2) Libraries and packages that Tensorflow depends on, for example ```python``` and ```scipy```. For GPU support, ```cuda```, ```cudnn```, etc. are also needed.
3) Tensorflow package.
**Libraries to access HDFS**
1) JDK
2) Hadoop
Here's an example of a base image (without GPU support) that installs Tensorflow:
```
FROM ubuntu:16.04
# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
python \
python-dev \
rsync \
software-properties-common \
unzip \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
RUN pip --no-cache-dir install \
Pillow \
h5py \
ipykernel \
jupyter \
matplotlib \
numpy \
pandas \
scipy \
sklearn \
&& \
python -m ipykernel.kernelspec
RUN pip --no-cache-dir install \
http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
```
On top of the above image, add files and install packages to access HDFS:
```
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
RUN tar zxf hadoop-3.1.0.tar.gz
```
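At container launch time the training process also needs to locate Java and the Hadoop client. One option, shown as a hedged sketch below, is to export them from the launch script you pass to the job; the exact paths are assumptions based on the packages installed above, so adjust them to your image.
```
# Hypothetical fragment of a launch script; paths assume openjdk-8-jdk on
# Ubuntu 16.04 and the Hadoop 3.1.0 tarball extracted at the image root.
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export HADOOP_HOME=/hadoop-3.1.0
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin
```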
Build and push the image to your own Docker registry: use ```docker build ...``` and ```docker push ...``` to finish this step.
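For example (a sketch only; the registry host, image name and tag below are placeholders, substitute your own):
```
# Build the image from one of the Dockerfiles and push it to your registry.
docker build -t your-registry.example.com/tf-on-yarn:0.0.1 -f Dockerfile.cpu.tf_1.8.0 .
docker push your-registry.example.com/tf-on-yarn:0.0.1
```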
## Use examples to build your own Tensorflow docker images
We provide the following examples for you to build your own Tensorflow Docker images.
For Tensorflow 1.8.0 (Precompiled to CUDA 9.x)
- *docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 with CPU support only.
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 with CPU support only, and includes the cifar10 models.
- *docker/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 with GPU support, prebuilt against CUDA 9.
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.8.0*: Tensorflow 1.8.0 with GPU support, prebuilt against CUDA 9, and includes the cifar10 models.
## Build Docker images
### Manually build Docker image:
Under the `docker/` directory, run `build-all.sh` to build the Docker images. It will build the following images:
- `tf-1.8.0-gpu-base:0.0.1`: the base GPU Docker image, which includes Hadoop, Tensorflow and GPU base libraries.
- `tf-1.8.0-cpu-base:0.0.1`: the base CPU Docker image, which includes Hadoop and Tensorflow.
- `tf-1.8.0-gpu:0.0.1`, which includes the cifar10 model (GPU).
- `tf-1.8.0-cpu:0.0.1`, which includes the cifar10 model (CPU only).
### Use prebuilt images
(Provided as-is, with no liability.)
You can also use the prebuilt images for convenience:
- hadoopsubmarine/tf-1.8.0-gpu:0.0.1
- hadoopsubmarine/tf-1.8.0-cpu:0.0.1
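For example, to fetch one of them ahead of time on a NodeManager host (assuming the Docker daemon there can reach Docker Hub):
```
# Pre-pull the GPU image so the first job launch does not pay the download cost.
docker pull hadoopsubmarine/tf-1.8.0-gpu:0.0.1
```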

View File

@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#banner {
height: 93px;
background: none;
}
#bannerLeft img {
margin-left: 30px;
margin-top: 10px;
}
#bannerRight img {
margin: 17px;
}

View File

@@ -0,0 +1,28 @@
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<project name="Apache Hadoop ${project.version}">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-stylus-skin</artifactId>
<version>${maven-stylus-skin.version}</version>
</skin>
<body>
<links>
<item name="Apache Hadoop" href="http://hadoop.apache.org/"/>
</links>
</body>
</project>