YARN-8800. Updated documentation of Submarine with latest examples. Contributed by Wangda Tan.
This commit is contained in:
parent
72891fc9be
commit
19ad5be651
@ -180,6 +180,12 @@
|
||||
<item name="System Services" href="hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html"/>
|
||||
</menu>
|
||||
|
||||
<menu name="Submarine" inherit="top">
|
||||
<item name="Index" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Index.html"/>
|
||||
<item name="QuickStart" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/QuickStart.html"/>
|
||||
<item name="Examples" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Examples.html"/>
|
||||
</menu>
|
||||
|
||||
<menu name="Hadoop Compatible File Systems" inherit="top">
|
||||
<item name="Aliyun OSS" href="hadoop-aliyun/tools/hadoop-aliyun/index.html"/>
|
||||
<item name="Amazon S3" href="hadoop-aws/tools/hadoop-aws/index.html"/>
|
||||
|
@ -48,6 +48,8 @@ Goals of Submarine:
|
||||
- Support launch tensorboard for training jobs if user specified.
|
||||
- Support customized DNS name for roles (like tensorboard.$user.$domain:6006)
|
||||
|
||||
Please jump to [QuickStart](src/site/QuickStart.md) guide to quickly understand how to use this framework.
|
||||
Please jump to [QuickStart](src/site/markdown/QuickStart.md) guide to quickly understand how to use this framework.
|
||||
|
||||
If you're a developer, please find [Developer](src/site/DeveloperGuide.md) guide for more details.
|
||||
Please jump to [Examples](src/site/markdown/Examples.md) to try other examples like running Distributed Tensorflow Training for CIFAR 10.
|
||||
|
||||
If you're a developer, please find [Developer](src/site/markdown/DeveloperGuide.md) guide for more details.
|
||||
|
@ -0,0 +1,69 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM ubuntu:16.04
|
||||
|
||||
LABEL maintainer="Craig Citro <craigcitro@google.com>"
|
||||
|
||||
# Pick up some TF dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
curl \
|
||||
libfreetype6-dev \
|
||||
libpng12-dev \
|
||||
libzmq3-dev \
|
||||
pkg-config \
|
||||
python \
|
||||
python-dev \
|
||||
rsync \
|
||||
software-properties-common \
|
||||
unzip \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
|
||||
python get-pip.py && \
|
||||
rm get-pip.py
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
Pillow \
|
||||
h5py \
|
||||
ipykernel \
|
||||
jupyter \
|
||||
matplotlib \
|
||||
numpy \
|
||||
pandas \
|
||||
scipy \
|
||||
sklearn \
|
||||
&& \
|
||||
python -m ipykernel.kernelspec
|
||||
|
||||
# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
|
||||
# These lines will be edited automatically by parameterized_docker_build.sh. #
|
||||
# COPY _PIP_FILE_ /
|
||||
# RUN pip --no-cache-dir install /_PIP_FILE_
|
||||
# RUN rm -f /_PIP_FILE_
|
||||
|
||||
# Install TensorFlow CPU version from central repo
|
||||
RUN pip --no-cache-dir install \
|
||||
http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
|
||||
|
||||
RUN apt-get update && apt-get install git -y
|
||||
|
||||
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
|
||||
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
|
||||
RUN tar zxf hadoop-3.1.1.tar.gz
|
@ -0,0 +1,67 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
|
||||
|
||||
# Pick up some TF dependencies
|
||||
RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
|
||||
build-essential \
|
||||
cuda-command-line-tools-9-0 \
|
||||
cuda-cublas-9-0 \
|
||||
cuda-cufft-9-0 \
|
||||
cuda-curand-9-0 \
|
||||
cuda-cusolver-9-0 \
|
||||
cuda-cusparse-9-0 \
|
||||
curl \
|
||||
libcudnn7=7.0.5.15-1+cuda9.0 \
|
||||
libfreetype6-dev \
|
||||
libpng12-dev \
|
||||
libzmq3-dev \
|
||||
pkg-config \
|
||||
python \
|
||||
python-dev \
|
||||
rsync \
|
||||
software-properties-common \
|
||||
unzip \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
|
||||
python get-pip.py && \
|
||||
rm get-pip.py
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
Pillow \
|
||||
h5py \
|
||||
ipykernel \
|
||||
jupyter \
|
||||
matplotlib \
|
||||
numpy \
|
||||
pandas \
|
||||
scipy \
|
||||
sklearn \
|
||||
&& \
|
||||
python -m ipykernel.kernelspec
|
||||
|
||||
# Install TensorFlow GPU version.
|
||||
RUN pip --no-cache-dir install \
|
||||
http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
|
||||
RUN apt-get update && apt-get install git -y
|
||||
|
||||
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
|
||||
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
|
||||
RUN tar zxf hadoop-3.1.0.tar.gz
|
@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
echo "Building base images"
|
||||
|
||||
set -e
|
||||
|
||||
cd base/ubuntu-16.04
|
||||
|
||||
docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu-base:0.0.1
|
||||
docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu-base:0.0.1
|
||||
|
||||
echo "Finished building base images"
|
||||
|
||||
cd ../../with-cifar10-models/ubuntu-16.04
|
||||
|
||||
docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu:0.0.1
|
||||
docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu:0.0.1
|
@ -0,0 +1,22 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM tf-1.8.0-cpu-base:0.0.1
|
||||
|
||||
# Include models
|
||||
RUN mkdir /test
|
||||
ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
|
||||
RUN chown -R nobody /test
|
@ -0,0 +1,22 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM tf-1.8.0-gpu-base:0.0.1
|
||||
|
||||
# Include models
|
||||
RUN mkdir /test
|
||||
ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
|
||||
RUN chown -R nobody /test
|
@ -0,0 +1,542 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
(Copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator)
|
||||
|
||||
CIFAR-10 is a common benchmark in machine learning for image recognition.
|
||||
|
||||
http://www.cs.toronto.edu/~kriz/cifar.html
|
||||
|
||||
Code in this directory focuses on how to use TensorFlow Estimators to train and
|
||||
evaluate a CIFAR-10 ResNet model on:
|
||||
|
||||
* A single host with one CPU;
|
||||
* A single host with multiple GPUs;
|
||||
* Multiple hosts with CPU or multiple GPUs;
|
||||
|
||||
Before trying to run the model we highly encourage you to read all the README.
|
||||
|
||||
## Prerequisite
|
||||
|
||||
1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.2.1 or
|
||||
later.
|
||||
|
||||
2. Download the CIFAR-10 dataset and generate TFRecord files using the provided
|
||||
script. The script and associated command below will download the CIFAR-10
|
||||
dataset and then generate a TFRecord for the training, validation, and
|
||||
evaluation datasets.
|
||||
|
||||
```shell
|
||||
python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data
|
||||
```
|
||||
|
||||
After running the command above, you should see the following files in the
|
||||
--data-dir (```ls -R cifar-10-data```):
|
||||
|
||||
* train.tfrecords
|
||||
* validation.tfrecords
|
||||
* eval.tfrecords
|
||||
|
||||
|
||||
## Training on a single machine with GPUs or CPU
|
||||
|
||||
Run the training on CPU only. After training, it runs the evaluation.
|
||||
|
||||
```
|
||||
python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
|
||||
--job-dir=/tmp/cifar10 \
|
||||
--num-gpus=0 \
|
||||
--train-steps=1000
|
||||
```
|
||||
|
||||
Run the model on 2 GPUs using CPU as parameter server. After training, it runs
|
||||
the evaluation.
|
||||
```
|
||||
python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
|
||||
--job-dir=/tmp/cifar10 \
|
||||
--num-gpus=2 \
|
||||
--train-steps=1000
|
||||
```
|
||||
|
||||
Run the model on 2 GPUs using GPU as parameter server.
|
||||
It will run an experiment, which for local setting basically means it will run
|
||||
stop training
|
||||
a couple of times to perform evaluation.
|
||||
|
||||
```
|
||||
python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
|
||||
--job-dir=/tmp/cifar10 \
|
||||
--variable-strategy GPU \
|
||||
--num-gpus=2 \
|
||||
```
|
||||
|
||||
There are more command line flags to play with; run
|
||||
`python cifar10_main.py --help` for details.
|
||||
|
||||
## Run distributed training
|
||||
|
||||
### (Optional) Running on Google Cloud Machine Learning Engine
|
||||
|
||||
This example can be run on Google Cloud Machine Learning Engine (ML Engine),
|
||||
which will configure the environment and take care of running workers,
|
||||
parameters servers, and masters in a fault tolerant way.
|
||||
|
||||
To install the command line tool, and set up a project and billing, see the
|
||||
quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line).
|
||||
|
||||
You'll also need a Google Cloud Storage bucket for the data. If you followed the
|
||||
instructions above, you can just run:
|
||||
|
||||
```
|
||||
MY_BUCKET=gs://<my-bucket-name>
|
||||
gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/
|
||||
```
|
||||
|
||||
Then run the following command from the `tutorials/image` directory of this
|
||||
repository (the parent directory of this README):
|
||||
|
||||
```
|
||||
gcloud ml-engine jobs submit training cifarmultigpu \
|
||||
--runtime-version 1.2 \
|
||||
--job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
|
||||
--config cifar10_estimator/cmle_config.yaml \
|
||||
--package-path cifar10_estimator/ \
|
||||
--module-name cifar10_estimator.cifar10_main \
|
||||
-- \
|
||||
--data-dir=$MY_BUCKET/cifar-10-data \
|
||||
--num-gpus=4 \
|
||||
--train-steps=1000
|
||||
```
|
||||
|
||||
|
||||
### Set TF_CONFIG
|
||||
|
||||
Considering that you already have multiple hosts configured, all you need is a
|
||||
`TF_CONFIG` environment variable on each host. You can set up the hosts manually
|
||||
or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for
|
||||
instructions about how to set up a Cluster.
|
||||
|
||||
The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and
|
||||
their task: `master`, `ps` or `worker`.
|
||||
|
||||
Here's an example of `TF_CONFIG`.
|
||||
|
||||
```python
|
||||
cluster = {'master': ['master-ip:8000'],
|
||||
'ps': ['ps-ip:8000'],
|
||||
'worker': ['worker-ip:8000']}
|
||||
|
||||
TF_CONFIG = json.dumps(
|
||||
{'cluster': cluster,
|
||||
'task': {'type': master, 'index': 0},
|
||||
'model_dir': 'gs://<bucket_path>/<dir_path>',
|
||||
'environment': 'cloud'
|
||||
})
|
||||
```
|
||||
|
||||
*Cluster*
|
||||
|
||||
A cluster spec, which is basically a dictionary that describes all of the tasks
|
||||
in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed).
|
||||
|
||||
In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker.
|
||||
|
||||
* `ps`: saves the parameters among all workers. All workers can
|
||||
read/write/update the parameters for model via ps. As some models are
|
||||
extremely large the parameters are shared among the ps (each ps stores a
|
||||
subset).
|
||||
|
||||
* `worker`: does the training.
|
||||
|
||||
* `master`: basically a special worker, it does training, but also restores and
|
||||
saves checkpoints and do evaluation.
|
||||
|
||||
*Task*
|
||||
|
||||
The Task defines what is the role of the current node, for this example the node
|
||||
is the master on index 0 on the cluster spec, the task will be different for
|
||||
each node. An example of the `TF_CONFIG` for a worker would be:
|
||||
|
||||
```python
|
||||
cluster = {'master': ['master-ip:8000'],
|
||||
'ps': ['ps-ip:8000'],
|
||||
'worker': ['worker-ip:8000']}
|
||||
|
||||
TF_CONFIG = json.dumps(
|
||||
{'cluster': cluster,
|
||||
'task': {'type': worker, 'index': 0},
|
||||
'model_dir': 'gs://<bucket_path>/<dir_path>',
|
||||
'environment': 'cloud'
|
||||
})
|
||||
```
|
||||
|
||||
*Model_dir*
|
||||
|
||||
This is the path where the master will save the checkpoints, graph and
|
||||
TensorBoard files. For a multi host environment you may want to use a
|
||||
Distributed File System, Google Storage and DFS are supported.
|
||||
|
||||
*Environment*
|
||||
|
||||
By the default environment is *local*, for a distributed setting we need to
|
||||
change it to *cloud*.
|
||||
|
||||
### Running script
|
||||
|
||||
Once you have a `TF_CONFIG` configured properly on each host you're ready to run
|
||||
on distributed settings.
|
||||
|
||||
#### Master
|
||||
Run this on master:
|
||||
Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for
|
||||
40000 steps. It will run evaluation a couple of times during training. The
|
||||
num_workers arugument is used only to update the learning rate correctly. Make
|
||||
sure the model_dir is the same as defined on the TF_CONFIG.
|
||||
|
||||
```shell
|
||||
python cifar10_main.py --data-dir=gs://path/cifar-10-data \
|
||||
--job-dir=gs://path/model_dir/ \
|
||||
--num-gpus=4 \
|
||||
--train-steps=40000 \
|
||||
--sync \
|
||||
--num-workers=2
|
||||
```
|
||||
|
||||
*Output:*
|
||||
|
||||
```shell
|
||||
INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
|
||||
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd16fb2be10>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
|
||||
gpu_options {
|
||||
}
|
||||
allow_soft_placement: true
|
||||
, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
|
||||
per_process_gpu_memory_fraction: 1.0
|
||||
}
|
||||
, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
|
||||
...
|
||||
2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
|
||||
name: Tesla K80
|
||||
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
|
||||
pciBusID 0000:00:04.0
|
||||
Total memory: 11.17GiB
|
||||
Free memory: 11.09GiB
|
||||
2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
|
||||
name: Tesla K80
|
||||
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
|
||||
pciBusID 0000:00:05.0
|
||||
Total memory: 11.17GiB
|
||||
Free memory: 11.10GiB
|
||||
...
|
||||
2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
|
||||
INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1
|
||||
INFO:tensorflow:Create CheckpointSaverHook.
|
||||
INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0
|
||||
2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config:
|
||||
intra_op_parallelism_threads: 1
|
||||
gpu_options {
|
||||
per_process_gpu_memory_fraction: 1
|
||||
}
|
||||
allow_soft_placement: true
|
||||
|
||||
INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt.
|
||||
INFO:tensorflow:loss = 1.20682, step = 1
|
||||
INFO:tensorflow:loss = 1.20682, learning_rate = 0.1
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
|
||||
INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
|
||||
INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14
|
||||
2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0)
|
||||
2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0)
|
||||
2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0)
|
||||
2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0)
|
||||
2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0)
|
||||
2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0)
|
||||
2017-08-01 20:00:15.745987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0)
|
||||
2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0)
|
||||
INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023
|
||||
INFO:tensorflow:Evaluation [1/100]
|
||||
INFO:tensorflow:Evaluation [2/100]
|
||||
INFO:tensorflow:Evaluation [3/100]
|
||||
INFO:tensorflow:Evaluation [4/100]
|
||||
INFO:tensorflow:Evaluation [5/100]
|
||||
INFO:tensorflow:Evaluation [6/100]
|
||||
INFO:tensorflow:Evaluation [7/100]
|
||||
INFO:tensorflow:Evaluation [8/100]
|
||||
INFO:tensorflow:Evaluation [9/100]
|
||||
INFO:tensorflow:Evaluation [10/100]
|
||||
INFO:tensorflow:Evaluation [11/100]
|
||||
INFO:tensorflow:Evaluation [12/100]
|
||||
INFO:tensorflow:Evaluation [13/100]
|
||||
...
|
||||
INFO:tensorflow:Evaluation [100/100]
|
||||
INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31
|
||||
INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425
|
||||
```
|
||||
|
||||
#### Worker
|
||||
|
||||
Run this on worker:
|
||||
Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for
|
||||
40000 steps. It will run evaluation a couple of times during training. Make sure
|
||||
the model_dir is the same as defined on the TF_CONFIG.
|
||||
|
||||
```shell
|
||||
python cifar10_main.py --data-dir=gs://path/cifar-10-data \
|
||||
--job-dir=gs://path/model_dir/ \
|
||||
--num-gpus=4 \
|
||||
--train-steps=40000 \
|
||||
--sync
|
||||
```
|
||||
|
||||
*Output:*
|
||||
|
||||
```shell
|
||||
INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
|
||||
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600,
|
||||
'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker',
|
||||
'_is_chief': False, '_cluster_spec':
|
||||
<tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6918438e10>,
|
||||
'_model_dir': 'gs://<path>/model_dir/',
|
||||
'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000,
|
||||
'_session_config': intra_op_parallelism_threads: 1
|
||||
gpu_options {
|
||||
}
|
||||
allow_soft_placement: true
|
||||
, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1,
|
||||
'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
|
||||
per_process_gpu_memory_fraction: 1.0
|
||||
}
|
||||
...
|
||||
2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
|
||||
name: Tesla K80
|
||||
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
|
||||
pciBusID 0000:00:04.0
|
||||
Total memory: 11.17GiB
|
||||
Free memory: 11.09GiB
|
||||
2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
|
||||
name: Tesla K80
|
||||
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
|
||||
pciBusID 0000:00:05.0
|
||||
Total memory: 11.17GiB
|
||||
Free memory: 11.10GiB
|
||||
...
|
||||
2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
|
||||
INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
|
||||
INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
|
||||
INFO:tensorflow:Create CheckpointSaverHook.
|
||||
2017-07-31 22:38:04.629150: I
|
||||
tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting
|
||||
for response from worker: /job:master/replica:0/task:0
|
||||
2017-07-31 22:38:09.263492: I
|
||||
tensorflow/core/distributed_runtime/master_session.cc:999] Start master
|
||||
session cc58f93b1e259b0c with config:
|
||||
intra_op_parallelism_threads: 1
|
||||
gpu_options {
|
||||
per_process_gpu_memory_fraction: 1
|
||||
}
|
||||
allow_soft_placement: true
|
||||
INFO:tensorflow:loss = 5.82382, step = 0
|
||||
INFO:tensorflow:loss = 5.82382, learning_rate = 0.8
|
||||
INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10
|
||||
INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20
|
||||
INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30
|
||||
INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40
|
||||
INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50
|
||||
INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60
|
||||
INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70
|
||||
INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec)
|
||||
INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec)
|
||||
INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80
|
||||
INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90
|
||||
INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100
|
||||
INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110
|
||||
INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120
|
||||
INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130
|
||||
INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140
|
||||
INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150
|
||||
INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160
|
||||
INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170
|
||||
INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec)
|
||||
...
|
||||
```
|
||||
|
||||
#### PS
|
||||
|
||||
Run this on ps:
|
||||
The ps will not do training so most of the arguments won't affect the execution
|
||||
|
||||
```shell
|
||||
python cifar10_main.py --job-dir=gs://path/model_dir/
|
||||
```
|
||||
|
||||
*Output:*
|
||||
|
||||
```shell
|
||||
INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
|
||||
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f48f1addf90>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
|
||||
gpu_options {
|
||||
}
|
||||
allow_soft_placement: true
|
||||
, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
|
||||
per_process_gpu_memory_fraction: 1.0
|
||||
}
|
||||
, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
|
||||
2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000}
|
||||
2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
|
||||
2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000}
|
||||
2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
|
||||
```
|
||||
|
||||
## Visualizing results with TensorBoard
|
||||
|
||||
When using Estimators you can also visualize your data in TensorBoard, with no
|
||||
changes in your code. You can use TensorBoard to visualize your TensorFlow
|
||||
graph, plot quantitative metrics about the execution of your graph, and show
|
||||
additional data like images that pass through it.
|
||||
|
||||
You'll see something similar to this if you "point" TensorBoard to the
|
||||
`job dir` parameter you used to train or evaluate your model.
|
||||
|
||||
Check TensorBoard during training or after it. Just point TensorBoard to the
|
||||
model_dir you chose on the previous step.
|
||||
|
||||
```shell
|
||||
tensorboard --log-dir="<job dir>"
|
||||
```
|
||||
|
||||
## Warnings
|
||||
|
||||
When runninng `cifar10_main.py` with `--sync` argument you may see an error
|
||||
similar to:
|
||||
|
||||
```python
|
||||
File "cifar10_main.py", line 538, in <module>
|
||||
tf.app.run()
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
|
||||
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
|
||||
File "cifar10_main.py", line 518, in main
|
||||
hooks), run_config=config)
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run
|
||||
return _execute_schedule(experiment, schedule)
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule
|
||||
return task()
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate
|
||||
hooks=self._eval_hooks)
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate
|
||||
hooks=hooks)
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate
|
||||
name=name)
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model
|
||||
features, labels, model_fn_lib.ModeKeys.EVAL)
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn
|
||||
features=features, labels=labels, **kwargs)
|
||||
File "cifar10_main.py", line 331, in _resnet_model_fn
|
||||
gradvars, global_step=tf.train.get_global_step())
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in apply_gradients
|
||||
variables.global_variables())
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped
|
||||
return _add_should_use_warning(fn(*args, **kwargs))
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning
|
||||
wrapped = TFShouldUseWarningWrapper(x)
|
||||
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__
|
||||
stack = [s.strip() for s in traceback.format_stack()]
|
||||
```
|
||||
|
||||
This should not affect your training, and should be fixed on the next releases.
|
@ -0,0 +1,113 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""CIFAR-10 data set.
|
||||
|
||||
See http://www.cs.toronto.edu/~kriz/cifar.html.
|
||||
"""
|
||||
import os
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
HEIGHT = 32
|
||||
WIDTH = 32
|
||||
DEPTH = 3
|
||||
|
||||
|
||||
class Cifar10DataSet(object):
|
||||
"""Cifar10 data set.
|
||||
|
||||
Described by http://www.cs.toronto.edu/~kriz/cifar.html.
|
||||
"""
|
||||
|
||||
def __init__(self, data_dir, subset='train', use_distortion=True):
|
||||
self.data_dir = data_dir
|
||||
self.subset = subset
|
||||
self.use_distortion = use_distortion
|
||||
|
||||
def get_filenames(self):
|
||||
if self.subset in ['train', 'validation', 'eval']:
|
||||
return [os.path.join(self.data_dir, self.subset + '.tfrecords')]
|
||||
else:
|
||||
raise ValueError('Invalid data subset "%s"' % self.subset)
|
||||
|
||||
def parser(self, serialized_example):
|
||||
"""Parses a single tf.Example into image and label tensors."""
|
||||
# Dimensions of the images in the CIFAR-10 dataset.
|
||||
# See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
|
||||
# input format.
|
||||
features = tf.parse_single_example(
|
||||
serialized_example,
|
||||
features={
|
||||
'image': tf.FixedLenFeature([], tf.string),
|
||||
'label': tf.FixedLenFeature([], tf.int64),
|
||||
})
|
||||
image = tf.decode_raw(features['image'], tf.uint8)
|
||||
image.set_shape([DEPTH * HEIGHT * WIDTH])
|
||||
|
||||
# Reshape from [depth * height * width] to [depth, height, width].
|
||||
image = tf.cast(
|
||||
tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
|
||||
tf.float32)
|
||||
label = tf.cast(features['label'], tf.int32)
|
||||
|
||||
# Custom preprocessing.
|
||||
image = self.preprocess(image)
|
||||
|
||||
return image, label
|
||||
|
||||
def make_batch(self, batch_size):
|
||||
"""Read the images and labels from 'filenames'."""
|
||||
filenames = self.get_filenames()
|
||||
# Repeat infinitely.
|
||||
dataset = tf.data.TFRecordDataset(filenames).repeat()
|
||||
|
||||
# Parse records.
|
||||
dataset = dataset.map(
|
||||
self.parser)
|
||||
|
||||
# Potentially shuffle records.
|
||||
if self.subset == 'train':
|
||||
min_queue_examples = int(
|
||||
Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4)
|
||||
# Ensure that the capacity is sufficiently large to provide good random
|
||||
# shuffling.
|
||||
dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)
|
||||
|
||||
# Batch it up.
|
||||
dataset = dataset.batch(batch_size)
|
||||
iterator = dataset.make_one_shot_iterator()
|
||||
image_batch, label_batch = iterator.get_next()
|
||||
|
||||
return image_batch, label_batch
|
||||
|
||||
def preprocess(self, image):
|
||||
"""Preprocess a single image in [height, width, depth] layout."""
|
||||
if self.subset == 'train' and self.use_distortion:
|
||||
# Pad 4 pixels on each dimension of feature map, done in mini-batch
|
||||
image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
|
||||
image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH])
|
||||
image = tf.image.random_flip_left_right(image)
|
||||
return image
|
||||
|
||||
@staticmethod
|
||||
def num_examples_per_epoch(subset='train'):
|
||||
if subset == 'train':
|
||||
return 45000
|
||||
elif subset == 'validation':
|
||||
return 5000
|
||||
elif subset == 'eval':
|
||||
return 10000
|
||||
else:
|
||||
raise ValueError('Invalid data subset "%s"' % subset)
|
@ -0,0 +1,521 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""ResNet model for classifying images from CIFAR-10 dataset.
|
||||
|
||||
Support single-host training with one or multiple devices.
|
||||
|
||||
ResNet as proposed in:
|
||||
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
|
||||
Deep Residual Learning for Image Recognition. arXiv:1512.03385
|
||||
|
||||
CIFAR-10 as in:
|
||||
http://www.cs.toronto.edu/~kriz/cifar.html
|
||||
|
||||
|
||||
"""
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import functools
|
||||
import itertools
|
||||
import os
|
||||
|
||||
import cifar10
|
||||
import cifar10_model
|
||||
import cifar10_utils
|
||||
import numpy as np
|
||||
import six
|
||||
from six.moves import xrange # pylint: disable=redefined-builtin
|
||||
import tensorflow as tf
|
||||
|
||||
tf.logging.set_verbosity(tf.logging.INFO)
|
||||
|
||||
|
||||
def get_model_fn(num_gpus, variable_strategy, num_workers):
|
||||
"""Returns a function that will build the resnet model."""
|
||||
|
||||
def _resnet_model_fn(features, labels, mode, params):
|
||||
"""Resnet model body.
|
||||
|
||||
Support single host, one or more GPU training. Parameter distribution can
|
||||
be either one of the following scheme.
|
||||
1. CPU is the parameter server and manages gradient updates.
|
||||
2. Parameters are distributed evenly across all GPUs, and the first GPU
|
||||
manages gradient updates.
|
||||
|
||||
Args:
|
||||
features: a list of tensors, one for each tower
|
||||
labels: a list of tensors, one for each tower
|
||||
mode: ModeKeys.TRAIN or EVAL
|
||||
params: Hyperparameters suitable for tuning
|
||||
Returns:
|
||||
A EstimatorSpec object.
|
||||
"""
|
||||
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
|
||||
weight_decay = params.weight_decay
|
||||
momentum = params.momentum
|
||||
|
||||
tower_features = features
|
||||
tower_labels = labels
|
||||
tower_losses = []
|
||||
tower_gradvars = []
|
||||
tower_preds = []
|
||||
|
||||
# channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
|
||||
# on CPU. The exception is Intel MKL on CPU which is optimal with
|
||||
# channels_last.
|
||||
data_format = params.data_format
|
||||
if not data_format:
|
||||
if num_gpus == 0:
|
||||
data_format = 'channels_last'
|
||||
else:
|
||||
data_format = 'channels_first'
|
||||
|
||||
if num_gpus == 0:
|
||||
num_devices = 1
|
||||
device_type = 'cpu'
|
||||
else:
|
||||
num_devices = num_gpus
|
||||
device_type = 'gpu'
|
||||
|
||||
for i in range(num_devices):
|
||||
worker_device = '/{}:{}'.format(device_type, i)
|
||||
if variable_strategy == 'CPU':
|
||||
device_setter = cifar10_utils.local_device_setter(
|
||||
worker_device=worker_device)
|
||||
elif variable_strategy == 'GPU':
|
||||
device_setter = cifar10_utils.local_device_setter(
|
||||
ps_device_type='gpu',
|
||||
worker_device=worker_device,
|
||||
ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
|
||||
num_gpus, tf.contrib.training.byte_size_load_fn))
|
||||
with tf.variable_scope('resnet', reuse=bool(i != 0)):
|
||||
with tf.name_scope('tower_%d' % i) as name_scope:
|
||||
with tf.device(device_setter):
|
||||
loss, gradvars, preds = _tower_fn(
|
||||
is_training, weight_decay, tower_features[i], tower_labels[i],
|
||||
data_format, params.num_layers, params.batch_norm_decay,
|
||||
params.batch_norm_epsilon)
|
||||
tower_losses.append(loss)
|
||||
tower_gradvars.append(gradvars)
|
||||
tower_preds.append(preds)
|
||||
if i == 0:
|
||||
# Only trigger batch_norm moving mean and variance update from
|
||||
# the 1st tower. Ideally, we should grab the updates from all
|
||||
# towers but these stats accumulate extremely fast so we can
|
||||
# ignore the other stats from the other towers without
|
||||
# significant detriment.
|
||||
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
|
||||
name_scope)
|
||||
|
||||
# Now compute global loss and gradients.
|
||||
gradvars = []
|
||||
with tf.name_scope('gradient_averaging'):
|
||||
all_grads = {}
|
||||
for grad, var in itertools.chain(*tower_gradvars):
|
||||
if grad is not None:
|
||||
all_grads.setdefault(var, []).append(grad)
|
||||
for var, grads in six.iteritems(all_grads):
|
||||
# Average gradients on the same device as the variables
|
||||
# to which they apply.
|
||||
with tf.device(var.device):
|
||||
if len(grads) == 1:
|
||||
avg_grad = grads[0]
|
||||
else:
|
||||
avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
|
||||
gradvars.append((avg_grad, var))
|
||||
|
||||
# Device that runs the ops to apply global gradient updates.
|
||||
consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
|
||||
with tf.device(consolidation_device):
|
||||
# Suggested learning rate scheduling from
|
||||
# https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
|
||||
num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
|
||||
'train') // (params.train_batch_size * num_workers)
|
||||
boundaries = [
|
||||
num_batches_per_epoch * x
|
||||
for x in np.array([82, 123, 300], dtype=np.int64)
|
||||
]
|
||||
staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
|
||||
|
||||
learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
|
||||
boundaries, staged_lr)
|
||||
|
||||
loss = tf.reduce_mean(tower_losses, name='loss')
|
||||
|
||||
examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
|
||||
params.train_batch_size, every_n_steps=10)
|
||||
|
||||
tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
|
||||
|
||||
logging_hook = tf.train.LoggingTensorHook(
|
||||
tensors=tensors_to_log, every_n_iter=100)
|
||||
|
||||
train_hooks = [logging_hook, examples_sec_hook]
|
||||
|
||||
optimizer = tf.train.MomentumOptimizer(
|
||||
learning_rate=learning_rate, momentum=momentum)
|
||||
|
||||
if params.sync:
|
||||
optimizer = tf.train.SyncReplicasOptimizer(
|
||||
optimizer, replicas_to_aggregate=num_workers)
|
||||
sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief)
|
||||
train_hooks.append(sync_replicas_hook)
|
||||
|
||||
# Create single grouped train op
|
||||
train_op = [
|
||||
optimizer.apply_gradients(
|
||||
gradvars, global_step=tf.train.get_global_step())
|
||||
]
|
||||
train_op.extend(update_ops)
|
||||
train_op = tf.group(*train_op)
|
||||
|
||||
predictions = {
|
||||
'classes':
|
||||
tf.concat([p['classes'] for p in tower_preds], axis=0),
|
||||
'probabilities':
|
||||
tf.concat([p['probabilities'] for p in tower_preds], axis=0)
|
||||
}
|
||||
stacked_labels = tf.concat(labels, axis=0)
|
||||
metrics = {
|
||||
'accuracy':
|
||||
tf.metrics.accuracy(stacked_labels, predictions['classes'])
|
||||
}
|
||||
|
||||
return tf.estimator.EstimatorSpec(
|
||||
mode=mode,
|
||||
predictions=predictions,
|
||||
loss=loss,
|
||||
train_op=train_op,
|
||||
training_hooks=train_hooks,
|
||||
eval_metric_ops=metrics)
|
||||
|
||||
return _resnet_model_fn
|
||||
|
||||
|
||||
def _tower_fn(is_training, weight_decay, feature, label, data_format,
|
||||
num_layers, batch_norm_decay, batch_norm_epsilon):
|
||||
"""Build computation tower (Resnet).
|
||||
|
||||
Args:
|
||||
is_training: true if is training graph.
|
||||
weight_decay: weight regularization strength, a float.
|
||||
feature: a Tensor.
|
||||
label: a Tensor.
|
||||
data_format: channels_last (NHWC) or channels_first (NCHW).
|
||||
num_layers: number of layers, an int.
|
||||
batch_norm_decay: decay for batch normalization, a float.
|
||||
batch_norm_epsilon: epsilon for batch normalization, a float.
|
||||
|
||||
Returns:
|
||||
A tuple with the loss for the tower, the gradients and parameters, and
|
||||
predictions.
|
||||
|
||||
"""
|
||||
model = cifar10_model.ResNetCifar10(
|
||||
num_layers,
|
||||
batch_norm_decay=batch_norm_decay,
|
||||
batch_norm_epsilon=batch_norm_epsilon,
|
||||
is_training=is_training,
|
||||
data_format=data_format)
|
||||
logits = model.forward_pass(feature, input_data_format='channels_last')
|
||||
tower_pred = {
|
||||
'classes': tf.argmax(input=logits, axis=1),
|
||||
'probabilities': tf.nn.softmax(logits)
|
||||
}
|
||||
|
||||
tower_loss = tf.losses.sparse_softmax_cross_entropy(
|
||||
logits=logits, labels=label)
|
||||
tower_loss = tf.reduce_mean(tower_loss)
|
||||
|
||||
model_params = tf.trainable_variables()
|
||||
tower_loss += weight_decay * tf.add_n(
|
||||
[tf.nn.l2_loss(v) for v in model_params])
|
||||
|
||||
tower_grad = tf.gradients(tower_loss, model_params)
|
||||
|
||||
return tower_loss, zip(tower_grad, model_params), tower_pred
|
||||
|
||||
|
||||
def input_fn(data_dir,
|
||||
subset,
|
||||
num_shards,
|
||||
batch_size,
|
||||
use_distortion_for_training=True):
|
||||
"""Create input graph for model.
|
||||
|
||||
Args:
|
||||
data_dir: Directory where TFRecords representing the dataset are located.
|
||||
subset: one of 'train', 'validate' and 'eval'.
|
||||
num_shards: num of towers participating in data-parallel training.
|
||||
batch_size: total batch size for training to be divided by the number of
|
||||
shards.
|
||||
use_distortion_for_training: True to use distortions.
|
||||
Returns:
|
||||
two lists of tensors for features and labels, each of num_shards length.
|
||||
"""
|
||||
with tf.device('/cpu:0'):
|
||||
use_distortion = subset == 'train' and use_distortion_for_training
|
||||
dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion)
|
||||
image_batch, label_batch = dataset.make_batch(batch_size)
|
||||
if num_shards <= 1:
|
||||
# No GPU available or only 1 GPU.
|
||||
return [image_batch], [label_batch]
|
||||
|
||||
# Note that passing num=batch_size is safe here, even though
|
||||
# dataset.batch(batch_size) can, in some cases, return fewer than batch_size
|
||||
# examples. This is because it does so only when repeating for a limited
|
||||
# number of epochs, but our dataset repeats forever.
|
||||
image_batch = tf.unstack(image_batch, num=batch_size, axis=0)
|
||||
label_batch = tf.unstack(label_batch, num=batch_size, axis=0)
|
||||
feature_shards = [[] for i in range(num_shards)]
|
||||
label_shards = [[] for i in range(num_shards)]
|
||||
for i in xrange(batch_size):
|
||||
idx = i % num_shards
|
||||
feature_shards[idx].append(image_batch[i])
|
||||
label_shards[idx].append(label_batch[i])
|
||||
feature_shards = [tf.parallel_stack(x) for x in feature_shards]
|
||||
label_shards = [tf.parallel_stack(x) for x in label_shards]
|
||||
return feature_shards, label_shards
|
||||
|
||||
|
||||
def get_experiment_fn(data_dir,
|
||||
num_gpus,
|
||||
variable_strategy,
|
||||
use_distortion_for_training=True):
|
||||
"""Returns an Experiment function.
|
||||
|
||||
Experiments perform training on several workers in parallel,
|
||||
in other words experiments know how to invoke train and eval in a sensible
|
||||
fashion for distributed training. Arguments passed directly to this
|
||||
function are not tunable, all other arguments should be passed within
|
||||
tf.HParams, passed to the enclosed function.
|
||||
|
||||
Args:
|
||||
data_dir: str. Location of the data for input_fns.
|
||||
num_gpus: int. Number of GPUs on each worker.
|
||||
variable_strategy: String. CPU to use CPU as the parameter server
|
||||
and GPU to use the GPUs as the parameter server.
|
||||
use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
|
||||
Returns:
|
||||
A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
|
||||
tf.contrib.learn.Experiment.
|
||||
|
||||
Suitable for use by tf.contrib.learn.learn_runner, which will run various
|
||||
methods on Experiment (train, evaluate) based on information
|
||||
about the current runner in `run_config`.
|
||||
"""
|
||||
|
||||
def _experiment_fn(run_config, hparams):
|
||||
"""Returns an Experiment."""
|
||||
# Create estimator.
|
||||
train_input_fn = functools.partial(
|
||||
input_fn,
|
||||
data_dir,
|
||||
subset='train',
|
||||
num_shards=num_gpus,
|
||||
batch_size=hparams.train_batch_size,
|
||||
use_distortion_for_training=use_distortion_for_training)
|
||||
|
||||
eval_input_fn = functools.partial(
|
||||
input_fn,
|
||||
data_dir,
|
||||
subset='eval',
|
||||
batch_size=hparams.eval_batch_size,
|
||||
num_shards=num_gpus)
|
||||
|
||||
num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
|
||||
if num_eval_examples % hparams.eval_batch_size != 0:
|
||||
raise ValueError(
|
||||
'validation set size must be multiple of eval_batch_size')
|
||||
|
||||
train_steps = hparams.train_steps
|
||||
eval_steps = num_eval_examples // hparams.eval_batch_size
|
||||
|
||||
classifier = tf.estimator.Estimator(
|
||||
model_fn=get_model_fn(num_gpus, variable_strategy,
|
||||
run_config.num_worker_replicas or 1),
|
||||
config=run_config,
|
||||
params=hparams)
|
||||
|
||||
# Create experiment.
|
||||
return tf.contrib.learn.Experiment(
|
||||
classifier,
|
||||
train_input_fn=train_input_fn,
|
||||
eval_input_fn=eval_input_fn,
|
||||
train_steps=train_steps,
|
||||
eval_steps=eval_steps)
|
||||
|
||||
return _experiment_fn
|
||||
|
||||
|
||||
def main(job_dir, data_dir, num_gpus, variable_strategy,
|
||||
use_distortion_for_training, log_device_placement, num_intra_threads,
|
||||
**hparams):
|
||||
# The env variable is on deprecation path, default is set to off.
|
||||
os.environ['TF_SYNC_ON_FINISH'] = '0'
|
||||
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
|
||||
|
||||
# Session configuration.
|
||||
sess_config = tf.ConfigProto(
|
||||
allow_soft_placement=True,
|
||||
log_device_placement=log_device_placement,
|
||||
intra_op_parallelism_threads=num_intra_threads,
|
||||
gpu_options=tf.GPUOptions(force_gpu_compatible=True))
|
||||
|
||||
config = cifar10_utils.RunConfig(
|
||||
session_config=sess_config, model_dir=job_dir)
|
||||
tf.contrib.learn.learn_runner.run(
|
||||
get_experiment_fn(data_dir, num_gpus, variable_strategy,
|
||||
use_distortion_for_training),
|
||||
run_config=config,
|
||||
hparams=tf.contrib.training.HParams(
|
||||
is_chief=config.is_chief,
|
||||
**hparams))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--data-dir',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The directory where the CIFAR-10 input data is stored.')
|
||||
parser.add_argument(
|
||||
'--job-dir',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The directory where the model will be stored.')
|
||||
parser.add_argument(
|
||||
'--variable-strategy',
|
||||
choices=['CPU', 'GPU'],
|
||||
type=str,
|
||||
default='CPU',
|
||||
help='Where to locate variable operations')
|
||||
parser.add_argument(
|
||||
'--num-gpus',
|
||||
type=int,
|
||||
default=1,
|
||||
help='The number of gpus used. Uses only CPU if set to 0.')
|
||||
parser.add_argument(
|
||||
'--num-layers',
|
||||
type=int,
|
||||
default=44,
|
||||
help='The number of layers of the model.')
|
||||
parser.add_argument(
|
||||
'--train-steps',
|
||||
type=int,
|
||||
default=80000,
|
||||
help='The number of steps to use for training.')
|
||||
parser.add_argument(
|
||||
'--train-batch-size',
|
||||
type=int,
|
||||
default=128,
|
||||
help='Batch size for training.')
|
||||
parser.add_argument(
|
||||
'--eval-batch-size',
|
||||
type=int,
|
||||
default=100,
|
||||
help='Batch size for validation.')
|
||||
parser.add_argument(
|
||||
'--momentum',
|
||||
type=float,
|
||||
default=0.9,
|
||||
help='Momentum for MomentumOptimizer.')
|
||||
parser.add_argument(
|
||||
'--weight-decay',
|
||||
type=float,
|
||||
default=2e-4,
|
||||
help='Weight decay for convolutions.')
|
||||
parser.add_argument(
|
||||
'--learning-rate',
|
||||
type=float,
|
||||
default=0.1,
|
||||
help="""\
|
||||
This is the inital learning rate value. The learning rate will decrease
|
||||
during training. For more details check the model_fn implementation in
|
||||
this file.\
|
||||
""")
|
||||
parser.add_argument(
|
||||
'--use-distortion-for-training',
|
||||
type=bool,
|
||||
default=True,
|
||||
help='If doing image distortion for training.')
|
||||
parser.add_argument(
|
||||
'--sync',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="""\
|
||||
If present when running in a distributed environment will run on sync mode.\
|
||||
""")
|
||||
parser.add_argument(
|
||||
'--num-intra-threads',
|
||||
type=int,
|
||||
default=0,
|
||||
help="""\
|
||||
Number of threads to use for intra-op parallelism. When training on CPU
|
||||
set to 0 to have the system pick the appropriate number or alternatively
|
||||
set it to the number of physical CPU cores.\
|
||||
""")
|
||||
parser.add_argument(
|
||||
'--num-inter-threads',
|
||||
type=int,
|
||||
default=0,
|
||||
help="""\
|
||||
Number of threads to use for inter-op parallelism. If set to 0, the
|
||||
system will pick an appropriate number.\
|
||||
""")
|
||||
parser.add_argument(
|
||||
'--data-format',
|
||||
type=str,
|
||||
default=None,
|
||||
help="""\
|
||||
If not set, the data format best for the training device is used.
|
||||
Allowed values: channels_first (NCHW) channels_last (NHWC).\
|
||||
""")
|
||||
parser.add_argument(
|
||||
'--log-device-placement',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='Whether to log device placement.')
|
||||
parser.add_argument(
|
||||
'--batch-norm-decay',
|
||||
type=float,
|
||||
default=0.997,
|
||||
help='Decay for batch norm.')
|
||||
parser.add_argument(
|
||||
'--batch-norm-epsilon',
|
||||
type=float,
|
||||
default=1e-5,
|
||||
help='Epsilon for batch norm.')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.num_gpus > 0:
|
||||
assert tf.test.is_gpu_available(), "Requested GPUs but none found."
|
||||
if args.num_gpus < 0:
|
||||
raise ValueError(
|
||||
'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.')
|
||||
if args.num_gpus == 0 and args.variable_strategy == 'GPU':
|
||||
raise ValueError('num-gpus=0, CPU must be used as parameter server. Set'
|
||||
'--variable-strategy=CPU.')
|
||||
if (args.num_layers - 2) % 6 != 0:
|
||||
raise ValueError('Invalid --num-layers parameter.')
|
||||
if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
|
||||
raise ValueError('--train-batch-size must be multiple of --num-gpus.')
|
||||
if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
|
||||
raise ValueError('--eval-batch-size must be multiple of --num-gpus.')
|
||||
|
||||
main(**vars(args))
|
@ -0,0 +1,80 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Model class for Cifar10 Dataset."""
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
import model_base
|
||||
|
||||
|
||||
class ResNetCifar10(model_base.ResNet):
|
||||
"""Cifar10 model with ResNetV1 and basic residual block."""
|
||||
|
||||
def __init__(self,
|
||||
num_layers,
|
||||
is_training,
|
||||
batch_norm_decay,
|
||||
batch_norm_epsilon,
|
||||
data_format='channels_first'):
|
||||
super(ResNetCifar10, self).__init__(
|
||||
is_training,
|
||||
data_format,
|
||||
batch_norm_decay,
|
||||
batch_norm_epsilon
|
||||
)
|
||||
self.n = (num_layers - 2) // 6
|
||||
# Add one in case label starts with 1. No impact if label starts with 0.
|
||||
self.num_classes = 10 + 1
|
||||
self.filters = [16, 16, 32, 64]
|
||||
self.strides = [1, 2, 2]
|
||||
|
||||
def forward_pass(self, x, input_data_format='channels_last'):
|
||||
"""Build the core model within the graph."""
|
||||
if self._data_format != input_data_format:
|
||||
if input_data_format == 'channels_last':
|
||||
# Computation requires channels_first.
|
||||
x = tf.transpose(x, [0, 3, 1, 2])
|
||||
else:
|
||||
# Computation requires channels_last.
|
||||
x = tf.transpose(x, [0, 2, 3, 1])
|
||||
|
||||
# Image standardization.
|
||||
x = x / 128 - 1
|
||||
|
||||
x = self._conv(x, 3, 16, 1)
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
|
||||
# Use basic (non-bottleneck) block and ResNet V1 (post-activation).
|
||||
res_func = self._residual_v1
|
||||
|
||||
# 3 stages of block stacking.
|
||||
for i in range(3):
|
||||
with tf.name_scope('stage'):
|
||||
for j in range(self.n):
|
||||
if j == 0:
|
||||
# First block in a stage, filters and strides may change.
|
||||
x = res_func(x, 3, self.filters[i], self.filters[i + 1],
|
||||
self.strides[i])
|
||||
else:
|
||||
# Following blocks in a stage, constant filters and unit stride.
|
||||
x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1)
|
||||
|
||||
x = self._global_avg_pool(x)
|
||||
x = self._fully_connected(x, self.num_classes)
|
||||
|
||||
return x
|
@ -0,0 +1,154 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
import collections
|
||||
import six
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
from tensorflow.core.framework import node_def_pb2
|
||||
from tensorflow.python.framework import device as pydev
|
||||
from tensorflow.python.training import basic_session_run_hooks
|
||||
from tensorflow.python.training import session_run_hook
|
||||
from tensorflow.python.training import training_util
|
||||
from tensorflow.python.training import device_setter
|
||||
from tensorflow.contrib.learn.python.learn import run_config
|
||||
|
||||
|
||||
# TODO(b/64848083) Remove once uid bug is fixed
|
||||
class RunConfig(tf.contrib.learn.RunConfig):
|
||||
def uid(self, whitelist=None):
|
||||
"""Generates a 'Unique Identifier' based on all internal fields.
|
||||
Caller should use the uid string to check `RunConfig` instance integrity
|
||||
in one session use, but should not rely on the implementation details, which
|
||||
is subject to change.
|
||||
Args:
|
||||
whitelist: A list of the string names of the properties uid should not
|
||||
include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
|
||||
includes most properties user allowes to change.
|
||||
Returns:
|
||||
A uid string.
|
||||
"""
|
||||
if whitelist is None:
|
||||
whitelist = run_config._DEFAULT_UID_WHITE_LIST
|
||||
|
||||
state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
|
||||
# Pop out the keys in whitelist.
|
||||
for k in whitelist:
|
||||
state.pop('_' + k, None)
|
||||
|
||||
ordered_state = collections.OrderedDict(
|
||||
sorted(state.items(), key=lambda t: t[0]))
|
||||
# For class instance without __repr__, some special cares are required.
|
||||
# Otherwise, the object address will be used.
|
||||
if '_cluster_spec' in ordered_state:
|
||||
ordered_state['_cluster_spec'] = collections.OrderedDict(
|
||||
sorted(ordered_state['_cluster_spec'].as_dict().items(),
|
||||
key=lambda t: t[0])
|
||||
)
|
||||
return ', '.join(
|
||||
'%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state))
|
||||
|
||||
|
||||
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
|
||||
"""Hook to print out examples per second.
|
||||
|
||||
Total time is tracked and then divided by the total number of steps
|
||||
to get the average step time and then batch_size is used to determine
|
||||
the running average of examples per second. The examples per second for the
|
||||
most recent interval is also logged.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
batch_size,
|
||||
every_n_steps=100,
|
||||
every_n_secs=None,):
|
||||
"""Initializer for ExamplesPerSecondHook.
|
||||
|
||||
Args:
|
||||
batch_size: Total batch size used to calculate examples/second from
|
||||
global time.
|
||||
every_n_steps: Log stats every n steps.
|
||||
every_n_secs: Log stats every n seconds.
|
||||
"""
|
||||
if (every_n_steps is None) == (every_n_secs is None):
|
||||
raise ValueError('exactly one of every_n_steps'
|
||||
' and every_n_secs should be provided.')
|
||||
self._timer = basic_session_run_hooks.SecondOrStepTimer(
|
||||
every_steps=every_n_steps, every_secs=every_n_secs)
|
||||
|
||||
self._step_train_time = 0
|
||||
self._total_steps = 0
|
||||
self._batch_size = batch_size
|
||||
|
||||
def begin(self):
|
||||
self._global_step_tensor = training_util.get_global_step()
|
||||
if self._global_step_tensor is None:
|
||||
raise RuntimeError(
|
||||
'Global step should be created to use StepCounterHook.')
|
||||
|
||||
def before_run(self, run_context): # pylint: disable=unused-argument
|
||||
return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
|
||||
|
||||
def after_run(self, run_context, run_values):
|
||||
_ = run_context
|
||||
|
||||
global_step = run_values.results
|
||||
if self._timer.should_trigger_for_step(global_step):
|
||||
elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
|
||||
global_step)
|
||||
if elapsed_time is not None:
|
||||
steps_per_sec = elapsed_steps / elapsed_time
|
||||
self._step_train_time += elapsed_time
|
||||
self._total_steps += elapsed_steps
|
||||
|
||||
average_examples_per_sec = self._batch_size * (
|
||||
self._total_steps / self._step_train_time)
|
||||
current_examples_per_sec = steps_per_sec * self._batch_size
|
||||
# Average examples/sec followed by current examples/sec
|
||||
logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
|
||||
average_examples_per_sec, current_examples_per_sec,
|
||||
self._total_steps)
|
||||
|
||||
def local_device_setter(num_devices=1,
|
||||
ps_device_type='cpu',
|
||||
worker_device='/cpu:0',
|
||||
ps_ops=None,
|
||||
ps_strategy=None):
|
||||
if ps_ops == None:
|
||||
ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
|
||||
|
||||
if ps_strategy is None:
|
||||
ps_strategy = device_setter._RoundRobinStrategy(num_devices)
|
||||
if not six.callable(ps_strategy):
|
||||
raise TypeError("ps_strategy must be callable")
|
||||
|
||||
def _local_device_chooser(op):
|
||||
current_device = pydev.DeviceSpec.from_string(op.device or "")
|
||||
|
||||
node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
|
||||
if node_def.op in ps_ops:
|
||||
ps_device_spec = pydev.DeviceSpec.from_string(
|
||||
'/{}:{}'.format(ps_device_type, ps_strategy(op)))
|
||||
|
||||
ps_device_spec.merge_from(current_device)
|
||||
return ps_device_spec.to_string()
|
||||
else:
|
||||
worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
|
||||
worker_device_spec.merge_from(current_device)
|
||||
return worker_device_spec.to_string()
|
||||
return _local_device_chooser
|
@ -0,0 +1,114 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords.
|
||||
|
||||
Generates tf.train.Example protos and writes them to TFRecord files from the
|
||||
python version of the CIFAR-10 dataset downloaded from
|
||||
https://www.cs.toronto.edu/~kriz/cifar.html.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import tarfile
|
||||
from six.moves import cPickle as pickle
|
||||
from six.moves import xrange # pylint: disable=redefined-builtin
|
||||
import tensorflow as tf
|
||||
|
||||
CIFAR_FILENAME = 'cifar-10-python.tar.gz'
|
||||
CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME
|
||||
CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py'
|
||||
|
||||
|
||||
def download_and_extract(data_dir):
|
||||
# download CIFAR-10 if not already downloaded.
|
||||
tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir,
|
||||
CIFAR_DOWNLOAD_URL)
|
||||
tarfile.open(os.path.join(data_dir, CIFAR_FILENAME),
|
||||
'r:gz').extractall(data_dir)
|
||||
|
||||
|
||||
def _int64_feature(value):
|
||||
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
|
||||
|
||||
|
||||
def _bytes_feature(value):
|
||||
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
|
||||
|
||||
|
||||
def _get_file_names():
|
||||
"""Returns the file names expected to exist in the input_dir."""
|
||||
file_names = {}
|
||||
file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)]
|
||||
file_names['validation'] = ['data_batch_5']
|
||||
file_names['eval'] = ['test_batch']
|
||||
return file_names
|
||||
|
||||
|
||||
def read_pickle_from_file(filename):
|
||||
with tf.gfile.Open(filename, 'rb') as f:
|
||||
data_dict = pickle.load(f)
|
||||
return data_dict
|
||||
|
||||
|
||||
def convert_to_tfrecord(input_files, output_file):
|
||||
"""Converts a file to TFRecords."""
|
||||
print('Generating %s' % output_file)
|
||||
with tf.python_io.TFRecordWriter(output_file) as record_writer:
|
||||
for input_file in input_files:
|
||||
data_dict = read_pickle_from_file(input_file)
|
||||
data = data_dict['data']
|
||||
labels = data_dict['labels']
|
||||
num_entries_in_batch = len(labels)
|
||||
for i in range(num_entries_in_batch):
|
||||
example = tf.train.Example(features=tf.train.Features(
|
||||
feature={
|
||||
'image': _bytes_feature(data[i].tobytes()),
|
||||
'label': _int64_feature(labels[i])
|
||||
}))
|
||||
record_writer.write(example.SerializeToString())
|
||||
|
||||
|
||||
def main(data_dir):
|
||||
print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL))
|
||||
download_and_extract(data_dir)
|
||||
file_names = _get_file_names()
|
||||
input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER)
|
||||
for mode, files in file_names.items():
|
||||
input_files = [os.path.join(input_dir, f) for f in files]
|
||||
output_file = os.path.join(data_dir, mode + '.tfrecords')
|
||||
try:
|
||||
os.remove(output_file)
|
||||
except OSError:
|
||||
pass
|
||||
# Convert to tf.train.Example and write the to TFRecords.
|
||||
convert_to_tfrecord(input_files, output_file)
|
||||
print('Done!')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--data-dir',
|
||||
type=str,
|
||||
default='',
|
||||
help='Directory to download and extract CIFAR-10 to.')
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args.data_dir)
|
@ -0,0 +1,219 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""ResNet model.
|
||||
|
||||
Related papers:
|
||||
https://arxiv.org/pdf/1603.05027v2.pdf
|
||||
https://arxiv.org/pdf/1512.03385v1.pdf
|
||||
https://arxiv.org/pdf/1605.07146v1.pdf
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class ResNet(object):
|
||||
"""ResNet model."""
|
||||
|
||||
def __init__(self, is_training, data_format, batch_norm_decay, batch_norm_epsilon):
|
||||
"""ResNet constructor.
|
||||
|
||||
Args:
|
||||
is_training: if build training or inference model.
|
||||
data_format: the data_format used during computation.
|
||||
one of 'channels_first' or 'channels_last'.
|
||||
"""
|
||||
self._batch_norm_decay = batch_norm_decay
|
||||
self._batch_norm_epsilon = batch_norm_epsilon
|
||||
self._is_training = is_training
|
||||
assert data_format in ('channels_first', 'channels_last')
|
||||
self._data_format = data_format
|
||||
|
||||
def forward_pass(self, x):
|
||||
raise NotImplementedError(
|
||||
'forward_pass() is implemented in ResNet sub classes')
|
||||
|
||||
def _residual_v1(self,
|
||||
x,
|
||||
kernel_size,
|
||||
in_filter,
|
||||
out_filter,
|
||||
stride,
|
||||
activate_before_residual=False):
|
||||
"""Residual unit with 2 sub layers, using Plan A for shortcut connection."""
|
||||
|
||||
del activate_before_residual
|
||||
with tf.name_scope('residual_v1') as name_scope:
|
||||
orig_x = x
|
||||
|
||||
x = self._conv(x, kernel_size, out_filter, stride)
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
|
||||
x = self._conv(x, kernel_size, out_filter, 1)
|
||||
x = self._batch_norm(x)
|
||||
|
||||
if in_filter != out_filter:
|
||||
orig_x = self._avg_pool(orig_x, stride, stride)
|
||||
pad = (out_filter - in_filter) // 2
|
||||
if self._data_format == 'channels_first':
|
||||
orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
|
||||
else:
|
||||
orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
|
||||
|
||||
x = self._relu(tf.add(x, orig_x))
|
||||
|
||||
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
|
||||
return x
|
||||
|
||||
def _residual_v2(self,
|
||||
x,
|
||||
in_filter,
|
||||
out_filter,
|
||||
stride,
|
||||
activate_before_residual=False):
|
||||
"""Residual unit with 2 sub layers with preactivation, plan A shortcut."""
|
||||
|
||||
with tf.name_scope('residual_v2') as name_scope:
|
||||
if activate_before_residual:
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
orig_x = x
|
||||
else:
|
||||
orig_x = x
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
|
||||
x = self._conv(x, 3, out_filter, stride)
|
||||
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
x = self._conv(x, 3, out_filter, [1, 1, 1, 1])
|
||||
|
||||
if in_filter != out_filter:
|
||||
pad = (out_filter - in_filter) // 2
|
||||
orig_x = self._avg_pool(orig_x, stride, stride)
|
||||
if self._data_format == 'channels_first':
|
||||
orig_x = tf.pad(orig_x, [[0, 0], [pad, pad], [0, 0], [0, 0]])
|
||||
else:
|
||||
orig_x = tf.pad(orig_x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
|
||||
|
||||
x = tf.add(x, orig_x)
|
||||
|
||||
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
|
||||
return x
|
||||
|
||||
def _bottleneck_residual_v2(self,
|
||||
x,
|
||||
in_filter,
|
||||
out_filter,
|
||||
stride,
|
||||
activate_before_residual=False):
|
||||
"""Bottleneck residual unit with 3 sub layers, plan B shortcut."""
|
||||
|
||||
with tf.name_scope('bottle_residual_v2') as name_scope:
|
||||
if activate_before_residual:
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
orig_x = x
|
||||
else:
|
||||
orig_x = x
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
|
||||
x = self._conv(x, 1, out_filter // 4, stride, is_atrous=True)
|
||||
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
# pad when stride isn't unit
|
||||
x = self._conv(x, 3, out_filter // 4, 1, is_atrous=True)
|
||||
|
||||
x = self._batch_norm(x)
|
||||
x = self._relu(x)
|
||||
x = self._conv(x, 1, out_filter, 1, is_atrous=True)
|
||||
|
||||
if in_filter != out_filter:
|
||||
orig_x = self._conv(orig_x, 1, out_filter, stride, is_atrous=True)
|
||||
x = tf.add(x, orig_x)
|
||||
|
||||
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
|
||||
return x
|
||||
|
||||
def _conv(self, x, kernel_size, filters, strides, is_atrous=False):
|
||||
"""Convolution."""
|
||||
|
||||
padding = 'SAME'
|
||||
if not is_atrous and strides > 1:
|
||||
pad = kernel_size - 1
|
||||
pad_beg = pad // 2
|
||||
pad_end = pad - pad_beg
|
||||
if self._data_format == 'channels_first':
|
||||
x = tf.pad(x, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
|
||||
else:
|
||||
x = tf.pad(x, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
|
||||
padding = 'VALID'
|
||||
return tf.layers.conv2d(
|
||||
inputs=x,
|
||||
kernel_size=kernel_size,
|
||||
filters=filters,
|
||||
strides=strides,
|
||||
padding=padding,
|
||||
use_bias=False,
|
||||
data_format=self._data_format)
|
||||
|
||||
def _batch_norm(self, x):
|
||||
if self._data_format == 'channels_first':
|
||||
data_format = 'NCHW'
|
||||
else:
|
||||
data_format = 'NHWC'
|
||||
return tf.contrib.layers.batch_norm(
|
||||
x,
|
||||
decay=self._batch_norm_decay,
|
||||
center=True,
|
||||
scale=True,
|
||||
epsilon=self._batch_norm_epsilon,
|
||||
is_training=self._is_training,
|
||||
fused=True,
|
||||
data_format=data_format)
|
||||
|
||||
def _relu(self, x):
|
||||
return tf.nn.relu(x)
|
||||
|
||||
def _fully_connected(self, x, out_dim):
|
||||
with tf.name_scope('fully_connected') as name_scope:
|
||||
x = tf.layers.dense(x, out_dim)
|
||||
|
||||
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
|
||||
return x
|
||||
|
||||
def _avg_pool(self, x, pool_size, stride):
|
||||
with tf.name_scope('avg_pool') as name_scope:
|
||||
x = tf.layers.average_pooling2d(
|
||||
x, pool_size, stride, 'SAME', data_format=self._data_format)
|
||||
|
||||
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
|
||||
return x
|
||||
|
||||
def _global_avg_pool(self, x):
|
||||
with tf.name_scope('global_avg_pool') as name_scope:
|
||||
assert x.get_shape().ndims == 4
|
||||
if self._data_format == 'channels_first':
|
||||
x = tf.reduce_mean(x, [2, 3])
|
||||
else:
|
||||
x = tf.reduce_mean(x, [1, 2])
|
||||
tf.logging.info('image after unit %s: %s', name_scope, x.get_shape())
|
||||
return x
|
@ -0,0 +1,75 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
FROM nvidia/cuda:9.0-base-ubuntu16.04
|
||||
|
||||
RUN echo "$LOG_TAG update and install basic packages" && \
|
||||
apt-get -y update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
curl \
|
||||
libfreetype6-dev \
|
||||
libpng12-dev \
|
||||
libzmq3-dev \
|
||||
pkg-config \
|
||||
rsync \
|
||||
software-properties-common \
|
||||
unzip \
|
||||
vim \
|
||||
wget \
|
||||
&& \
|
||||
apt-get install -y locales && \
|
||||
locale-gen $LANG && \
|
||||
apt-get clean && \
|
||||
apt -y autoclean && \
|
||||
apt -y dist-upgrade && \
|
||||
apt-get install -y build-essential && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
|
||||
RUN echo "$LOG_TAG Install java8" && \
|
||||
apt-get -y update && \
|
||||
apt-get install -y openjdk-8-jdk && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Zeppelin
|
||||
ENV Z_VERSION="0.7.3" \
|
||||
Z_HOME="/zeppelin"
|
||||
|
||||
RUN echo "$LOG_TAG Download Zeppelin binary" && \
|
||||
wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
|
||||
tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
|
||||
rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
|
||||
mv /zeppelin-${Z_VERSION}-bin-all ${Z_HOME}
|
||||
ENV PATH="${Z_HOME}/bin:${PATH}"
|
||||
|
||||
RUN echo "$LOG_TAG Set locale" && \
|
||||
echo "LC_ALL=en_US.UTF-8" >> /etc/environment && \
|
||||
echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen && \
|
||||
echo "LANG=en_US.UTF-8" > /etc/locale.conf && \
|
||||
locale-gen en_US.UTF-8
|
||||
|
||||
ENV LANG=en_US.UTF-8 \
|
||||
LC_ALL=en_US.UTF-8
|
||||
|
||||
COPY zeppelin-site.xml $Z_HOME/conf/zeppelin-site.xml
|
||||
COPY shiro.ini ${Z_HOME}/conf/shiro.ini
|
||||
RUN chmod 777 -R ${Z_HOME}
|
||||
|
||||
COPY run_container.sh /usr/local/bin/run_container.sh
|
||||
RUN chmod 755 /usr/local/bin/run_container.sh
|
||||
|
||||
EXPOSE 8080
|
||||
CMD ["/usr/local/bin/run_container.sh"]
|
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"${Z_HOME}/bin/zeppelin-daemon.sh" start
|
||||
while true; do
|
||||
#perform the test
|
||||
sleep 5
|
||||
done
|
@ -0,0 +1,120 @@
|
||||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
[users]
|
||||
# List of users with their password allowed to access Zeppelin.
|
||||
# To use a different strategy (LDAP / Database / ...) check the shiro doc at http://shiro.apache.org/configuration.html#Configuration-INISections
|
||||
# To enable admin user, uncomment the following line and set an appropriate password.
|
||||
admin = admin, admin
|
||||
user1 = password2, role1, role2
|
||||
user2 = password3, role3
|
||||
user3 = password4, role2
|
||||
|
||||
# Sample LDAP configuration, for user Authentication, currently tested for single Realm
|
||||
[main]
|
||||
### A sample for configuring Active Directory Realm
|
||||
#activeDirectoryRealm = org.apache.zeppelin.realm.ActiveDirectoryGroupRealm
|
||||
#activeDirectoryRealm.systemUsername = userNameA
|
||||
|
||||
#use either systemPassword or hadoopSecurityCredentialPath, more details in http://zeppelin.apache.org/docs/latest/security/shiroauthentication.html
|
||||
#activeDirectoryRealm.systemPassword = passwordA
|
||||
#activeDirectoryRealm.hadoopSecurityCredentialPath = jceks://file/user/zeppelin/zeppelin.jceks
|
||||
#activeDirectoryRealm.searchBase = CN=Users,DC=SOME_GROUP,DC=COMPANY,DC=COM
|
||||
#activeDirectoryRealm.url = ldap://ldap.test.com:389
|
||||
#activeDirectoryRealm.groupRolesMap = "CN=admin,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"admin","CN=finance,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"finance","CN=hr,OU=groups,DC=SOME_GROUP,DC=COMPANY,DC=COM":"hr"
|
||||
#activeDirectoryRealm.authorizationCachingEnabled = false
|
||||
|
||||
### A sample for configuring LDAP Directory Realm
|
||||
#ldapRealm = org.apache.zeppelin.realm.LdapGroupRealm
|
||||
## search base for ldap groups (only relevant for LdapGroupRealm):
|
||||
#ldapRealm.contextFactory.environment[ldap.searchBase] = dc=COMPANY,dc=COM
|
||||
#ldapRealm.contextFactory.url = ldap://ldap.test.com:389
|
||||
#ldapRealm.userDnTemplate = uid={0},ou=Users,dc=COMPANY,dc=COM
|
||||
#ldapRealm.contextFactory.authenticationMechanism = simple
|
||||
|
||||
### A sample PAM configuration
|
||||
#pamRealm=org.apache.zeppelin.realm.PamRealm
|
||||
#pamRealm.service=sshd
|
||||
|
||||
### A sample for configuring ZeppelinHub Realm
|
||||
#zeppelinHubRealm = org.apache.zeppelin.realm.ZeppelinHubRealm
|
||||
## Url of ZeppelinHub
|
||||
#zeppelinHubRealm.zeppelinhubUrl = https://www.zeppelinhub.com
|
||||
#securityManager.realms = $zeppelinHubRealm
|
||||
|
||||
## A same for configuring Knox SSO Realm
|
||||
#knoxJwtRealm = org.apache.zeppelin.realm.jwt.KnoxJwtRealm
|
||||
#knoxJwtRealm.providerUrl = https://domain.example.com/
|
||||
#knoxJwtRealm.login = gateway/knoxsso/knoxauth/login.html
|
||||
#knoxJwtRealm.logout = gateway/knoxssout/api/v1/webssout
|
||||
#knoxJwtRealm.logoutAPI = true
|
||||
#knoxJwtRealm.redirectParam = originalUrl
|
||||
#knoxJwtRealm.cookieName = hadoop-jwt
|
||||
#knoxJwtRealm.publicKeyPath = /etc/zeppelin/conf/knox-sso.pem
|
||||
#
|
||||
#knoxJwtRealm.groupPrincipalMapping = group.principal.mapping
|
||||
#knoxJwtRealm.principalMapping = principal.mapping
|
||||
#authc = org.apache.zeppelin.realm.jwt.KnoxAuthenticationFilter
|
||||
|
||||
sessionManager = org.apache.shiro.web.session.mgt.DefaultWebSessionManager
|
||||
|
||||
### If caching of user is required then uncomment below lines
|
||||
#cacheManager = org.apache.shiro.cache.MemoryConstrainedCacheManager
|
||||
#securityManager.cacheManager = $cacheManager
|
||||
|
||||
### Enables 'HttpOnly' flag in Zeppelin cookies
|
||||
cookie = org.apache.shiro.web.servlet.SimpleCookie
|
||||
cookie.name = JSESSIONID
|
||||
cookie.httpOnly = true
|
||||
### Uncomment the below line only when Zeppelin is running over HTTPS
|
||||
#cookie.secure = true
|
||||
sessionManager.sessionIdCookie = $cookie
|
||||
|
||||
securityManager.sessionManager = $sessionManager
|
||||
# 86,400,000 milliseconds = 24 hour
|
||||
securityManager.sessionManager.globalSessionTimeout = 86400000
|
||||
shiro.loginUrl = /api/login
|
||||
|
||||
[roles]
|
||||
role1 = *
|
||||
role2 = *
|
||||
role3 = *
|
||||
admin = *
|
||||
|
||||
[urls]
|
||||
# This section is used for url-based security. For details see the shiro.ini documentation.
|
||||
#
|
||||
# You can secure interpreter, configuration and credential information by urls.
|
||||
# Comment or uncomment the below urls that you want to hide:
|
||||
# anon means the access is anonymous.
|
||||
# authc means form based auth Security.
|
||||
#
|
||||
# IMPORTANT: Order matters: URL path expressions are evaluated against an incoming request
|
||||
# in the order they are defined and the FIRST MATCH WINS.
|
||||
#
|
||||
# To allow anonymous access to all but the stated urls,
|
||||
# uncomment the line second last line (/** = anon) and comment the last line (/** = authc)
|
||||
#
|
||||
/api/version = anon
|
||||
# Allow all authenticated users to restart interpreters on a notebook page.
|
||||
# Comment out the following line if you would like to authorize only admin users to restart interpreters.
|
||||
/api/interpreter/setting/restart/** = authc
|
||||
/api/interpreter/** = authc, roles[admin]
|
||||
/api/configurations/** = authc, roles[admin]
|
||||
/api/credential/** = authc, roles[admin]
|
||||
#/** = anon
|
||||
/** = authc
|
@ -0,0 +1,569 @@
|
||||
<?xml version="1.0"?>
|
||||
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<configuration>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.addr</name>
|
||||
<value>0.0.0.0</value>
|
||||
<description>Server address</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.port</name>
|
||||
<value>8080</value>
|
||||
<description>Server port.</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.ssl.port</name>
|
||||
<value>8443</value>
|
||||
<description>Server ssl port. (used when ssl property is set to true)</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.context.path</name>
|
||||
<value>/</value>
|
||||
<description>Context Path of the Web Application</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.war.tempdir</name>
|
||||
<value>webapps</value>
|
||||
<description>Location of jetty temporary directory</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.dir</name>
|
||||
<value>notebook</value>
|
||||
<description>path or URI for notebook persist</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.homescreen</name>
|
||||
<value></value>
|
||||
<description>id of notebook to be displayed in homescreen. ex) 2A94M5J1Z Empty value displays default home screen</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.homescreen.hide</name>
|
||||
<value>false</value>
|
||||
<description>hide homescreen notebook from list when this value set to true</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.collaborative.mode.enable</name>
|
||||
<value>true</value>
|
||||
<description>Enable collaborative mode</description>
|
||||
</property>
|
||||
|
||||
<!-- Google Cloud Storage notebook storage -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.gcs.dir</name>
|
||||
<value></value>
|
||||
<description>
|
||||
A GCS path in the form gs://bucketname/path/to/dir.
|
||||
Notes are stored at {zeppelin.notebook.gcs.dir}/{notebook-id}/note.json
|
||||
</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.GCSNotebookRepo</value>
|
||||
<description>notebook persistence layer implementation</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- Amazon S3 notebook storage -->
|
||||
<!-- Creates the following directory structure: s3://{bucket}/{username}/{notebook-id}/note.json -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.user</name>
|
||||
<value>user</value>
|
||||
<description>user name for s3 folder structure</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.bucket</name>
|
||||
<value>zeppelin</value>
|
||||
<description>bucket name for notebook storage</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.endpoint</name>
|
||||
<value>s3.amazonaws.com</value>
|
||||
<description>endpoint for s3 bucket</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.S3NotebookRepo</value>
|
||||
<description>notebook persistence layer implementation</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- Additionally, encryption is supported for notebook data stored in S3 -->
|
||||
<!-- Use the AWS KMS to encrypt data -->
|
||||
<!-- If used, the EC2 role assigned to the EMR cluster must have rights to use the given key -->
|
||||
<!-- See https://aws.amazon.com/kms/ and http://docs.aws.amazon.com/kms/latest/developerguide/concepts.html -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.kmsKeyID</name>
|
||||
<value>AWS-KMS-Key-UUID</value>
|
||||
<description>AWS KMS key ID used to encrypt notebook data in S3</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- provide region of your KMS key -->
|
||||
<!-- See http://docs.aws.amazon.com/general/latest/gr/rande.html#kms_region for region codes names -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.kmsKeyRegion</name>
|
||||
<value>us-east-1</value>
|
||||
<description>AWS KMS key region in your AWS account</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- Use a custom encryption materials provider to encrypt data -->
|
||||
<!-- No configuration is given to the provider, so you must use system properties or another means to configure -->
|
||||
<!-- See https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/EncryptionMaterialsProvider.html -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.encryptionMaterialsProvider</name>
|
||||
<value>provider implementation class name</value>
|
||||
<description>Custom encryption materials provider used to encrypt notebook data in S3</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- Server-side encryption enabled for notebooks -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.sse</name>
|
||||
<value>true</value>
|
||||
<description>Server-side encryption enabled for notebooks</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- Optional override to control which signature algorithm should be used to sign AWS requests -->
|
||||
<!-- Set this property to "S3SignerType" if your AWS S3 compatible APIs support only AWS Signature Version 2 such as Ceph. -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.s3.signerOverride</name>
|
||||
<value>S3SignerType</value>
|
||||
<description>optional override to control which signature algorithm should be used to sign AWS requests</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- If using Azure for storage use the following settings -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.azure.connectionString</name>
|
||||
<value>DefaultEndpointsProtocol=https;AccountName=<accountName>;AccountKey=<accountKey></value>
|
||||
<description>Azure account credentials</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.azure.share</name>
|
||||
<value>zeppelin</value>
|
||||
<description>share name for notebook storage</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.azure.user</name>
|
||||
<value>user</value>
|
||||
<description>optional user name for Azure folder structure</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.AzureNotebookRepo</value>
|
||||
<description>notebook persistence layer implementation</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- Notebook storage layer using local file system
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.VFSNotebookRepo</value>
|
||||
<description>local notebook persistence layer implementation</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- Notebook storage layer using hadoop compatible file system
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.FileSystemNotebookRepo</value>
|
||||
<description>Hadoop compatible file system notebook persistence layer implementation, such as local file system, hdfs, azure wasb, s3 and etc.</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.kerberos.keytab</name>
|
||||
<value></value>
|
||||
<description>keytab for accessing kerberized hdfs</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.kerberos.principal</name>
|
||||
<value></value>
|
||||
<description>principal for accessing kerberized hdfs</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- For connecting your Zeppelin with ZeppelinHub -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.GitNotebookRepo, org.apache.zeppelin.notebook.repo.zeppelinhub.ZeppelinHubRepo</value>
|
||||
<description>two notebook persistence layers (versioned local + ZeppelinHub)</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- MongoDB notebook storage -->
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.MongoNotebookRepo</value>
|
||||
<description>notebook persistence layer implementation</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.mongo.uri</name>
|
||||
<value>mongodb://localhost</value>
|
||||
<description>MongoDB connection URI used to connect to a MongoDB database server</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.mongo.database</name>
|
||||
<value>zeppelin</value>
|
||||
<description>database name for notebook storage</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.mongo.collection</name>
|
||||
<value>notes</value>
|
||||
<description>collection name for notebook storage</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.mongo.autoimport</name>
|
||||
<value>false</value>
|
||||
<description>import local notes into MongoDB automatically on startup</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.storage</name>
|
||||
<value>org.apache.zeppelin.notebook.repo.GitNotebookRepo</value>
|
||||
<description>versioned notebook persistence layer implementation</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.one.way.sync</name>
|
||||
<value>false</value>
|
||||
<description>If there are multiple notebook storages, should we treat the first one as the only source of truth?</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.dir</name>
|
||||
<value>interpreter</value>
|
||||
<description>Interpreter implementation base directory</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.localRepo</name>
|
||||
<value>local-repo</value>
|
||||
<description>Local repository for interpreter's additional dependency loading</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.dep.mvnRepo</name>
|
||||
<value>http://repo1.maven.org/maven2/</value>
|
||||
<description>Remote principal repository for interpreter's additional dependency loading</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.dep.localrepo</name>
|
||||
<value>local-repo</value>
|
||||
<description>Local repository for dependency loader</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.helium.node.installer.url</name>
|
||||
<value>https://nodejs.org/dist/</value>
|
||||
<description>Remote Node installer url for Helium dependency loader</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.helium.npm.installer.url</name>
|
||||
<value>http://registry.npmjs.org/</value>
|
||||
<description>Remote Npm installer url for Helium dependency loader</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.helium.yarnpkg.installer.url</name>
|
||||
<value>https://github.com/yarnpkg/yarn/releases/download/</value>
|
||||
<description>Remote Yarn package installer url for Helium dependency loader</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreters</name>
|
||||
<value>org.apache.zeppelin.spark.SparkInterpreter,org.apache.zeppelin.spark.PySparkInterpreter,org.apache.zeppelin.rinterpreter.RRepl,org.apache.zeppelin.rinterpreter.KnitR,org.apache.zeppelin.spark.SparkRInterpreter,org.apache.zeppelin.spark.SparkSqlInterpreter,org.apache.zeppelin.spark.DepInterpreter,org.apache.zeppelin.markdown.Markdown,org.apache.zeppelin.angular.AngularInterpreter,org.apache.zeppelin.shell.ShellInterpreter,org.apache.zeppelin.file.HDFSFileInterpreter,org.apache.zeppelin.flink.FlinkInterpreter,,org.apache.zeppelin.python.PythonInterpreter,org.apache.zeppelin.python.PythonInterpreterPandasSql,org.apache.zeppelin.python.PythonCondaInterpreter,org.apache.zeppelin.python.PythonDockerInterpreter,org.apache.zeppelin.lens.LensInterpreter,org.apache.zeppelin.ignite.IgniteInterpreter,org.apache.zeppelin.ignite.IgniteSqlInterpreter,org.apache.zeppelin.cassandra.CassandraInterpreter,org.apache.zeppelin.geode.GeodeOqlInterpreter,org.apache.zeppelin.jdbc.JDBCInterpreter,org.apache.zeppelin.kylin.KylinInterpreter,org.apache.zeppelin.elasticsearch.ElasticsearchInterpreter,org.apache.zeppelin.scalding.ScaldingInterpreter,org.apache.zeppelin.alluxio.AlluxioInterpreter,org.apache.zeppelin.hbase.HbaseInterpreter,org.apache.zeppelin.livy.LivySparkInterpreter,org.apache.zeppelin.livy.LivyPySparkInterpreter,org.apache.zeppelin.livy.LivyPySpark3Interpreter,org.apache.zeppelin.livy.LivySparkRInterpreter,org.apache.zeppelin.livy.LivySparkSQLInterpreter,org.apache.zeppelin.bigquery.BigQueryInterpreter,org.apache.zeppelin.beam.BeamInterpreter,org.apache.zeppelin.pig.PigInterpreter,org.apache.zeppelin.pig.PigQueryInterpreter,org.apache.zeppelin.scio.ScioInterpreter,org.apache.zeppelin.groovy.GroovyInterpreter</value>
|
||||
<description>Comma separated interpreter configurations. First interpreter become a default</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.group.order</name>
|
||||
<value>spark,md,angular,sh,livy,alluxio,file,psql,flink,python,ignite,lens,cassandra,geode,kylin,elasticsearch,scalding,jdbc,hbase,bigquery,beam,groovy</value>
|
||||
<description></description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.connect.timeout</name>
|
||||
<value>30000</value>
|
||||
<description>Interpreter process connect timeout in msec.</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.output.limit</name>
|
||||
<value>102400</value>
|
||||
<description>Output message from interpreter exceeding the limit will be truncated</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.ssl</name>
|
||||
<value>false</value>
|
||||
<description>Should SSL be used by the servers?</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.ssl.client.auth</name>
|
||||
<value>false</value>
|
||||
<description>Should client authentication be used for SSL connections?</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.ssl.keystore.path</name>
|
||||
<value>keystore</value>
|
||||
<description>Path to keystore relative to Zeppelin configuration directory</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.ssl.keystore.type</name>
|
||||
<value>JKS</value>
|
||||
<description>The format of the given keystore (e.g. JKS or PKCS12)</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.ssl.keystore.password</name>
|
||||
<value>change me</value>
|
||||
<description>Keystore password. Can be obfuscated by the Jetty Password tool</description>
|
||||
</property>
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.ssl.key.manager.password</name>
|
||||
<value>change me</value>
|
||||
<description>Key Manager password. Defaults to keystore password. Can be obfuscated.</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<property>
|
||||
<name>zeppelin.ssl.truststore.path</name>
|
||||
<value>truststore</value>
|
||||
<description>Path to truststore relative to Zeppelin configuration directory. Defaults to the keystore path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.ssl.truststore.type</name>
|
||||
<value>JKS</value>
|
||||
<description>The format of the given truststore (e.g. JKS or PKCS12). Defaults to the same type as the keystore type</description>
|
||||
</property>
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.ssl.truststore.password</name>
|
||||
<value>change me</value>
|
||||
<description>Truststore password. Can be obfuscated by the Jetty Password tool. Defaults to the keystore password</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.allowed.origins</name>
|
||||
<value>*</value>
|
||||
<description>Allowed sources for REST and WebSocket requests (i.e. http://onehost:8080,http://otherhost.com). If you leave * you are vulnerable to https://issues.apache.org/jira/browse/ZEPPELIN-173</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.anonymous.allowed</name>
|
||||
<value>false</value>
|
||||
<description>Anonymous user allowed by default</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.username.force.lowercase</name>
|
||||
<value>false</value>
|
||||
<description>Force convert username case to lower case, useful for Active Directory/LDAP. Default is not to change case</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.default.owner.username</name>
|
||||
<value></value>
|
||||
<description>Set owner role by default</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.public</name>
|
||||
<value>true</value>
|
||||
<description>Make notebook public by default when created, private otherwise</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.websocket.max.text.message.size</name>
|
||||
<value>1024000</value>
|
||||
<description>Size in characters of the maximum text message to be received by websocket. Defaults to 1024000</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.default.dir.allowed</name>
|
||||
<value>false</value>
|
||||
<description>Enable directory listings on server.</description>
|
||||
</property>
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.interpreter.lifecyclemanager.class</name>
|
||||
<value>org.apache.zeppelin.interpreter.lifecycle.TimeoutLifecycleManager</value>
|
||||
<description>LifecycleManager class for managing the lifecycle of interpreters, by default interpreter will
|
||||
be closed after timeout</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.lifecyclemanager.timeout.checkinterval</name>
|
||||
<value>60000</value>
|
||||
<description>Milliseconds of the interval to checking whether interpreter is time out</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.interpreter.lifecyclemanager.timeout.threshold</name>
|
||||
<value>3600000</value>
|
||||
<description>Milliseconds of the interpreter timeout threshold, by default it is 1 hour</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.server.jetty.name</name>
|
||||
<value>Jetty(7.6.0.v20120127)</value>
|
||||
<description>Hardcoding Application Server name to Prevent Fingerprinting</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.server.jetty.request.header.size</name>
|
||||
<value>8192</value>
|
||||
<description>Http Request Header Size Limit (to prevent HTTP 413)</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.server.xframe.options</name>
|
||||
<value>SAMEORIGIN</value>
|
||||
<description>The X-Frame-Options HTTP response header can be used to indicate whether or not a browser should be allowed to render a page in a frame/iframe/object.</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.server.strict.transport</name>
|
||||
<value>max-age=631138519</value>
|
||||
<description>The HTTP Strict-Transport-Security response header is a security feature that lets a web site tell browsers that it should only be communicated with using HTTPS, instead of using HTTP. Enable this when Zeppelin is running on HTTPS. Value is in Seconds, the default value is equivalent to 20 years.</description>
|
||||
</property>
|
||||
-->
|
||||
<!--
|
||||
|
||||
<property>
|
||||
<name>zeppelin.server.xxss.protection</name>
|
||||
<value>1</value>
|
||||
<description>The HTTP X-XSS-Protection response header is a feature of Internet Explorer, Chrome and Safari that stops pages from loading when they detect reflected cross-site scripting (XSS) attacks. When value is set to 1 and a cross-site scripting attack is detected, the browser will sanitize the page (remove the unsafe parts).</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.interpreter.callback.portRange</name>
|
||||
<value>10000:10010</value>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.recovery.storage.class</name>
|
||||
<value>org.apache.zeppelin.interpreter.recovery.FileSystemRecoveryStorage</value>
|
||||
<description>ReoveryStorage implementation</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!--
|
||||
<property>
|
||||
<name>zeppelin.recovery.dir</name>
|
||||
<value>recovery</value>
|
||||
<description>Location where recovery metadata is stored</description>
|
||||
</property>
|
||||
-->
|
||||
|
||||
<!-- GitHub configurations
|
||||
<property>
|
||||
<name>zeppelin.notebook.git.remote.url</name>
|
||||
<value></value>
|
||||
<description>remote Git repository URL</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.git.remote.username</name>
|
||||
<value>token</value>
|
||||
<description>remote Git repository username</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.git.remote.access-token</name>
|
||||
<value></value>
|
||||
<description>remote Git repository password</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.git.remote.origin</name>
|
||||
<value>origin</value>
|
||||
<description>Git repository remote</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>zeppelin.notebook.cron.enable</name>
|
||||
<value>false</value>
|
||||
<description>Notebook enable cron scheduler feature</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>zeppelin.notebook.cron.folders</name>
|
||||
<value></value>
|
||||
<description>Notebook cron folders</description>
|
||||
</property>
|
||||
-->
|
||||
</configuration>
|
@ -12,9 +12,7 @@
|
||||
limitations under the License. See accompanying LICENSE file.
|
||||
-->
|
||||
|
||||
# Developper Guide
|
||||
|
||||
(Need add more details)
|
||||
# Developer Guide
|
||||
|
||||
By default, submarine uses YARN service framework as runtime. If you want to add your own implementation. You can add a new `RuntimeFactory` implementation and configure following option to `submarine.xml` (which should be placed under same `$HADOOP_CONF_DIR`)
|
||||
|
@ -0,0 +1,21 @@
|
||||
<!---
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. See accompanying LICENSE file.
|
||||
-->
|
||||
|
||||
# Examples
|
||||
|
||||
Here're some examples about Submarine usage.
|
||||
|
||||
[Running Distributed CIFAR 10 Tensorflow Job](RunningDistributedCifar10TFJobs.html)
|
||||
|
||||
[Running Zeppelin Notebook on YARN](RunningZeppelinOnYARN.html)
|
@ -0,0 +1,42 @@
|
||||
<!---
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. See accompanying LICENSE file.
|
||||
-->
|
||||
|
||||
Submarine is a project which allows infra engineer / data scientist to run *unmodified* Tensorflow programs on YARN.
|
||||
|
||||
Goals of Submarine:
|
||||
|
||||
- It allows jobs for easy access to data/models in HDFS and other storages.
|
||||
|
||||
- Can launch services to serve Tensorflow/MXNet models.
|
||||
|
||||
- Support run distributed Tensorflow jobs with simple configs.
|
||||
|
||||
- Support run user-specified Docker images.
|
||||
|
||||
- Support specify GPU and other resources.
|
||||
|
||||
- Support launch tensorboard for training jobs if user specified.
|
||||
|
||||
- Support customized DNS name for roles (like tensorboard.$user.$domain:6006)
|
||||
|
||||
|
||||
Click below contents if you want to understand more.
|
||||
|
||||
- [QuickStart Guide](QuickStart.html)
|
||||
|
||||
- [Examples](Examples.html)
|
||||
|
||||
- [How to write Dockerfile for Submarine jobs](WriteDockerfile.html)
|
||||
|
||||
- [Developer guide](DeveloperGuide.html)
|
@ -17,11 +17,13 @@
|
||||
## Prerequisite
|
||||
|
||||
Must:
|
||||
- Apache Hadoop 3.1.0, YARN service enabled.
|
||||
|
||||
- Apache Hadoop 3.1.x, YARN service enabled.
|
||||
|
||||
Optional:
|
||||
- Enable YARN DNS. (When distributed training required.)
|
||||
- Enable GPU on YARN support. (When GPU-based training required.)
|
||||
|
||||
- Enable YARN DNS. (When distributed training is required.)
|
||||
- Enable GPU on YARN support. (When GPU-based training is required.)
|
||||
|
||||
## Run jobs
|
||||
|
||||
@ -81,15 +83,26 @@ yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job run \
|
||||
--input_path hdfs://default/dataset/cifar-10-data \
|
||||
--checkpoint_path hdfs://default/tmp/cifar-10-jobdir \
|
||||
--worker_resources memory=4G,vcores=2,gpu=2 \
|
||||
--worker_launch_cmd "python ... (Your training application cmd)"
|
||||
--worker_launch_cmd "python ... (Your training application cmd)" \
|
||||
--tensorboard # this will launch a companion tensorboard container for monitoring
|
||||
```
|
||||
|
||||
#### Notes:
|
||||
|
||||
1) `DOCKER_JAVA_HOME` points to JAVA_HOME inside Docker image.
|
||||
|
||||
2) `DOCKER_HADOOP_HDFS_HOME` points to HADOOP_HDFS_HOME inside Docker image.
|
||||
|
||||
3) `--worker_resources` can include gpu when you need GPU to train your task.
|
||||
|
||||
4) When `--tensorboard` is specified, you can go to YARN new UI, go to services -> `<you specified service>` -> Click `...` to access Tensorboard.
|
||||
|
||||
This will launch a Tensorboard to monitor *all your jobs*. By access YARN UI (the new UI). You can go to services page, go to the `tensorboard-service`, click quick links (`Tensorboard`) can lead you to the tensorboard.
|
||||
|
||||
See below screenshot:
|
||||
|
||||
![alt text](./images/tensorboard-service.png "Tensorboard service")
|
||||
|
||||
### Launch Distributed Tensorflow Application:
|
||||
|
||||
#### Commandline
|
||||
@ -110,12 +123,14 @@ yarn jar hadoop-yarn-applications-submarine-<version>.jar job run \
|
||||
#### Notes:
|
||||
|
||||
1) Very similar to standalone TF application, but you need to specify #worker/#ps
|
||||
|
||||
2) Different resources can be specified for worker and PS.
|
||||
|
||||
3) `TF_CONFIG` environment will be auto generated and set before executing user's launch command.
|
||||
|
||||
## Run jobs
|
||||
## Get job history / logs
|
||||
|
||||
### Get Job Status
|
||||
### Get Job Status from CLI
|
||||
|
||||
```
|
||||
yarn jar hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar job show --name tf-job-001
|
||||
@ -131,4 +146,29 @@ Job Meta Info:
|
||||
(... all your commandline before run the job)
|
||||
```
|
||||
|
||||
After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Tensorboard of the job.
|
||||
After that, you can run ```tensorboard --logdir=<checkpoint-path>``` to view Tensorboard of the job.
|
||||
|
||||
### Run tensorboard to monitor your jobs
|
||||
|
||||
```
|
||||
# Cleanup previous service if needed
|
||||
yarn app -destroy tensorboard-service; \
|
||||
yarn jar /tmp/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
|
||||
job run --name tensorboard-service --verbose --docker_image wtan/tf-1.8.0-cpu:0.0.3 \
|
||||
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ \
|
||||
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0 \
|
||||
--num_workers 0 --tensorboard
|
||||
```
|
||||
|
||||
You can view multiple job training history like from the `Tensorboard` link:
|
||||
|
||||
![alt text](./images/multiple-tensorboard-jobs.png "Tensorboard for multiple jobs")
|
||||
|
||||
|
||||
### Get component logs from a training job
|
||||
|
||||
There're two ways to get training job logs, one is from YARN UI (new or old):
|
||||
|
||||
![alt text](./images/job-logs-ui.png "Job logs UI")
|
||||
|
||||
Or you can use `yarn logs -applicationId <applicationId>` to get logs from CLI
|
@ -0,0 +1,162 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
# Tutorial: Running Distributed Cifar10 Tensorflow Estimator Example.
|
||||
|
||||
## Prepare data for training
|
||||
|
||||
CIFAR-10 is a common benchmark in machine learning for image recognition. Below example is based on CIFAR-10 dataset.
|
||||
|
||||
1) Checkout https://github.com/tensorflow/models/:
|
||||
```
|
||||
git clone https://github.com/tensorflow/models/
|
||||
```
|
||||
|
||||
2) Go to `models/tutorials/image/cifar10_estimator`
|
||||
|
||||
3) Generate data by using following command: (required Tensorflow installed)
|
||||
|
||||
```
|
||||
python generate_cifar10_tfrecords.py --data-dir=cifar-10-data
|
||||
```
|
||||
|
||||
4) Upload data to HDFS
|
||||
|
||||
```
|
||||
hadoop fs -put cifar-10-data/ /dataset/cifar-10-data
|
||||
```
|
||||
|
||||
**Please note that:**
|
||||
|
||||
YARN service doesn't allow multiple services with the same name, so please run following command
|
||||
```
|
||||
yarn application -destroy <service-name>
|
||||
```
|
||||
to delete services if you want to reuse the same service name.
|
||||
|
||||
## Prepare Docker images
|
||||
|
||||
Refer to [Write Dockerfile](WriteDockerfile.md) to build a Docker image or use prebuilt one.
|
||||
|
||||
## Run Tensorflow jobs
|
||||
|
||||
### Run standalone training
|
||||
|
||||
```
|
||||
yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
|
||||
job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \
|
||||
--input_path hdfs://default/dataset/cifar-10-data \
|
||||
--env DOCKER_JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/
|
||||
--env DOCKER_HADOOP_HDFS_HOME=/hadoop-3.1.0
|
||||
--num_workers 1 --worker_resources memory=8G,vcores=2,gpu=1 \
|
||||
--worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \
|
||||
--tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3
|
||||
```
|
||||
|
||||
Explanations:
|
||||
|
||||
- When access of HDFS is required, the two environments are required to indicate: JAVA_HOME and HDFS_HOME to access libhdfs libraries *inside Docker image*. We will try to eliminate specifying this in the future.
|
||||
- Docker image for worker and tensorboard can be specified separately. For this case, Tensorboard doesn't need GPU, so we will use cpu Docker image for Tensorboard. (Same for parameter-server in the distributed example below).
|
||||
|
||||
### Run distributed training
|
||||
|
||||
```
|
||||
yarn jar path/to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
|
||||
job run --name tf-job-001 --verbose --docker_image hadoopsubmarine/tf-1.8.0-gpu:0.0.1 \
|
||||
--input_path hdfs://default/dataset/cifar-10-data \
|
||||
--env(s) (same as standalone)
|
||||
--num_workers 2 \
|
||||
--worker_resources memory=8G,vcores=2,gpu=1 \
|
||||
--worker_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --train-steps=10000 --eval-batch-size=16 --train-batch-size=16 --num-gpus=2 --sync" \
|
||||
--ps_docker_image wtan/tf-1.8.0-cpu:0.0.3 \
|
||||
--num_ps 1 --ps_resources memory=4G,vcores=2,gpu=0 \
|
||||
--ps_launch_cmd "cd /test/models/tutorials/image/cifar10_estimator && python cifar10_main.py --data-dir=%input_path% --job-dir=%checkpoint_path% --num-gpus=0" \
|
||||
--tensorboard --tensorboard_docker_image wtan/tf-1.8.0-cpu:0.0.3
|
||||
```
|
||||
|
||||
Explanations:
|
||||
|
||||
- `>1` num_workers indicates it is a distributed training.
|
||||
- Parameters / resources / Docker image of parameter server can be specified separately. For many cases, parameter server doesn't require GPU.
|
||||
|
||||
*Outputs of distributed training*
|
||||
|
||||
Sample output of master:
|
||||
```
|
||||
...
|
||||
allow_soft_placement: true
|
||||
, '_tf_random_seed': None, '_task_type': u'master', '_environment': u'cloud', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe77cb15050>, '_tf_config': gpu_options {
|
||||
per_process_gpu_memory_fraction: 1.0
|
||||
}
|
||||
...
|
||||
2018-05-06 22:29:14.656022: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> localhost:8000}
|
||||
2018-05-06 22:29:14.656097: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000}
|
||||
2018-05-06 22:29:14.656112: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000}
|
||||
2018-05-06 22:29:14.659359: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
|
||||
...
|
||||
INFO:tensorflow:Restoring parameters from hdfs://default/tmp/cifar-10-jobdir/model.ckpt-0
|
||||
INFO:tensorflow:Evaluation [1/625]
|
||||
INFO:tensorflow:Evaluation [2/625]
|
||||
INFO:tensorflow:Evaluation [3/625]
|
||||
INFO:tensorflow:Evaluation [4/625]
|
||||
INFO:tensorflow:Evaluation [5/625]
|
||||
INFO:tensorflow:Evaluation [6/625]
|
||||
...
|
||||
INFO:tensorflow:Validation (step 1): loss = 1220.6445, global_step = 1, accuracy = 0.1
|
||||
INFO:tensorflow:loss = 6.3980675, step = 0
|
||||
INFO:tensorflow:loss = 6.3980675, learning_rate = 0.1
|
||||
INFO:tensorflow:global_step/sec: 2.34092
|
||||
INFO:tensorflow:Average examples/sec: 1931.22 (1931.22), step = 100
|
||||
INFO:tensorflow:Average examples/sec: 354.236 (38.6479), step = 110
|
||||
INFO:tensorflow:Average examples/sec: 211.096 (38.7693), step = 120
|
||||
INFO:tensorflow:Average examples/sec: 156.533 (38.1633), step = 130
|
||||
INFO:tensorflow:Average examples/sec: 128.6 (38.7372), step = 140
|
||||
INFO:tensorflow:Average examples/sec: 111.533 (39.0239), step = 150
|
||||
```
|
||||
|
||||
Sample output of worker:
|
||||
```
|
||||
, '_tf_random_seed': None, '_task_type': u'worker', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc2a490b050>, '_tf_config': gpu_options {
|
||||
per_process_gpu_memory_fraction: 1.0
|
||||
}
|
||||
...
|
||||
2018-05-06 22:28:45.807936: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000}
|
||||
2018-05-06 22:28:45.808040: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> ps-0.distributed-tf.root.tensorflow.site:8000}
|
||||
2018-05-06 22:28:45.808064: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> localhost:8000}
|
||||
2018-05-06 22:28:45.809919: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
|
||||
...
|
||||
INFO:tensorflow:loss = 5.319096, step = 0
|
||||
INFO:tensorflow:loss = 5.319096, learning_rate = 0.1
|
||||
INFO:tensorflow:Average examples/sec: 49.2338 (49.2338), step = 10
|
||||
INFO:tensorflow:Average examples/sec: 52.117 (55.3589), step = 20
|
||||
INFO:tensorflow:Average examples/sec: 53.2754 (55.7541), step = 30
|
||||
INFO:tensorflow:Average examples/sec: 53.8388 (55.6028), step = 40
|
||||
INFO:tensorflow:Average examples/sec: 54.1082 (55.2134), step = 50
|
||||
INFO:tensorflow:Average examples/sec: 54.3141 (55.3676), step = 60
|
||||
```
|
||||
|
||||
Sample output of ps:
|
||||
```
|
||||
...
|
||||
, '_tf_random_seed': None, '_task_type': u'ps', '_environment': u'cloud', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4be54dff90>, '_tf_config': gpu_options {
|
||||
per_process_gpu_memory_fraction: 1.0
|
||||
}
|
||||
...
|
||||
2018-05-06 22:28:42.562316: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-0.distributed-tf.root.tensorflow.site:8000}
|
||||
2018-05-06 22:28:42.562408: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
|
||||
2018-05-06 22:28:42.562433: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-0.distributed-tf.root.tensorflow.site:8000}
|
||||
2018-05-06 22:28:42.564242: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
|
||||
```
|
@ -0,0 +1,37 @@
|
||||
<!---
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. See accompanying LICENSE file.
|
||||
-->
|
||||
|
||||
# Running Zeppelin Notebook On Submarine
|
||||
|
||||
This is a simple example about how to run Zeppelin notebook by using Submarine.
|
||||
|
||||
## Step 1: Build Docker Image
|
||||
|
||||
Go to `src/main/docker/zeppelin-notebook-example`, build the Docker image. Or you can use the prebuilt one: `hadoopsubmarine/zeppelin-on-yarn-gpu:0.0.1`
|
||||
|
||||
## Step 2: Launch the notebook on YARN
|
||||
|
||||
Submit command to YARN:
|
||||
|
||||
`yarn app -destroy zeppelin-notebook;
|
||||
yarn jar path-to/hadoop-yarn-applications-submarine-3.2.0-SNAPSHOT.jar \
|
||||
job run --name zeppelin-notebook \
|
||||
--docker_image hadoopsubmarine/zeppelin-on-yarn-gpu:0.0.1 \
|
||||
--worker_resources memory=8G,vcores=2,gpu=1 \
|
||||
--num_workers 1 \
|
||||
-worker_launch_cmd "/usr/local/bin/run_container.sh"`
|
||||
|
||||
Once the container got launched, you can go to `YARN services` UI page, access the `zeppelin-notebook` job, and go to the quicklink `notebook` by clicking `...`.
|
||||
|
||||
The notebook is secured by admin/admin user name and password.
|
@ -0,0 +1,117 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# Creating Docker Images for Running Tensorflow on YARN
|
||||
|
||||
## How to create docker images to run Tensorflow on YARN
|
||||
|
||||
Dockerfile to run Tensorflow on YARN need two part:
|
||||
|
||||
**Base libraries which Tensorflow depends on**
|
||||
|
||||
1) OS base image, for example ```ubuntu:16.04```
|
||||
|
||||
2) Tensorflow depended libraries and packages. For example ```python```, ```scipy```. For GPU support, need ```cuda```, ```cudnn```, etc.
|
||||
|
||||
3) Tensorflow package.
|
||||
|
||||
**Libraries to access HDFS**
|
||||
|
||||
1) JDK
|
||||
|
||||
2) Hadoop
|
||||
|
||||
Here's an example of a base image (w/o GPU support) to install Tensorflow:
|
||||
```
|
||||
FROM ubuntu:16.04
|
||||
|
||||
# Pick up some TF dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
curl \
|
||||
libfreetype6-dev \
|
||||
libpng12-dev \
|
||||
libzmq3-dev \
|
||||
pkg-config \
|
||||
python \
|
||||
python-dev \
|
||||
rsync \
|
||||
software-properties-common \
|
||||
unzip \
|
||||
&& \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
|
||||
python get-pip.py && \
|
||||
rm get-pip.py
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
Pillow \
|
||||
h5py \
|
||||
ipykernel \
|
||||
jupyter \
|
||||
matplotlib \
|
||||
numpy \
|
||||
pandas \
|
||||
scipy \
|
||||
sklearn \
|
||||
&& \
|
||||
python -m ipykernel.kernelspec
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
|
||||
```
|
||||
|
||||
On top of above image, add files, install packages to access HDFS
|
||||
```
|
||||
RUN apt-get update && apt-get install -y openjdk-8-jdk wget
|
||||
RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
|
||||
RUN tar zxf hadoop-3.1.0.tar.gz
|
||||
```
|
||||
|
||||
Build and push to your own docker registry: Use ```docker build ... ``` and ```docker push ...``` to finish this step.
|
||||
|
||||
## Use examples to build your own Tensorflow docker images
|
||||
|
||||
We provided following examples for you to build tensorflow docker images.
|
||||
|
||||
For Tensorflow 1.8.0 (Precompiled to CUDA 9.x)
|
||||
|
||||
- *docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only.
|
||||
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0*: Tensorflow 1.8.0 supports CPU only, and included models
|
||||
- *docker/base/ubuntu-16.04/Dockerfile.gpu.cuda_9.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9.
|
||||
- *docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.cuda_8.0.tf_1.8.0*: Tensorflow 1.8.0 supports GPU, which is prebuilt to CUDA9, with models.
|
||||
|
||||
## Build Docker images
|
||||
|
||||
### Manually build Docker image:
|
||||
|
||||
Under `docker/` directory, run `build-all.sh` to build Docker images. It will build following images:
|
||||
|
||||
- `tf-1.8.0-gpu-base:0.0.1` for base Docker image which includes Hadoop, Tensorflow, GPU base libraries.
|
||||
- `tf-1.8.0-gpu-base:0.0.1` for base Docker image which includes Hadoop. Tensorflow.
|
||||
- `tf-1.8.0-gpu:0.0.1` which includes cifar10 model
|
||||
- `tf-1.8.0-cpu:0.0.1` which inclues cifar10 model (cpu only).
|
||||
|
||||
### Use prebuilt images
|
||||
|
||||
(No liability)
|
||||
You can also use prebuilt images for convenience:
|
||||
|
||||
- hadoopsubmarine/tf-1.8.0-gpu:0.0.1
|
||||
- hadoopsubmarine/tf-1.8.0-cpu:0.0.1
|
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#banner {
|
||||
height: 93px;
|
||||
background: none;
|
||||
}
|
||||
|
||||
#bannerLeft img {
|
||||
margin-left: 30px;
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
#bannerRight img {
|
||||
margin: 17px;
|
||||
}
|
Binary file not shown.
After Width: | Height: | Size: 225 KiB |
Binary file not shown.
After Width: | Height: | Size: 180 KiB |
Binary file not shown.
After Width: | Height: | Size: 105 KiB |
@ -0,0 +1,28 @@
|
||||
<!--
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. See accompanying LICENSE file.
|
||||
-->
|
||||
<project name="Apache Hadoop ${project.version}">
|
||||
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-stylus-skin</artifactId>
|
||||
<version>${maven-stylus-skin.version}</version>
|
||||
</skin>
|
||||
|
||||
<body>
|
||||
<links>
|
||||
<item name="Apache Hadoop" href="http://hadoop.apache.org/"/>
|
||||
</links>
|
||||
</body>
|
||||
|
||||
</project>
|
Loading…
x
Reference in New Issue
Block a user