File metrics collector end to end test #832

Merged 1 commit on Sep 29, 2019
6 changes: 6 additions & 0 deletions examples/v1alpha3/file-metrics-collector/Dockerfile
@@ -0,0 +1,6 @@
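# Trainer image for the file metrics collector e2e test; mnist.py writes its
# metrics to /katib/mnist.log, which the collector reads.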
FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime

WORKDIR /var
ADD mnist.py /var

ENTRYPOINT ["python", "/var/mnist.py"]
148 changes: 148 additions & 0 deletions examples/v1alpha3/file-metrics-collector/mnist.py
@@ -0,0 +1,148 @@
from __future__ import print_function

import argparse
import logging
import os

from torchvision import datasets, transforms
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

WORLD_SIZE = int(os.environ.get('WORLD_SIZE', 1))

logging.basicConfig(filename='/katib/mnist.log', level=logging.DEBUG)
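# NOTE: this log path must match spec.metricsCollectorSpec.source.fileSystemPath.path
# in the Experiment manifest below, so the file metrics collector can read the metrics.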

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            msg = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item())
            print(msg)
            logging.debug(msg)
            niter = epoch * len(train_loader) + batch_idx

def test(args, model, device, test_loader, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
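    # The file metrics collector scans the log for "<metric-name>=<value>" lines;
    # this reports the objective metric ("accuracy") named in the Experiment spec.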
    logging.info('\naccuracy={:.4f}\n'.format(float(correct) / len(test_loader.dataset)))


def should_distribute():
    return dist.is_available() and WORLD_SIZE > 1


def is_distributed():
    return dist.is_available() and dist.is_initialized()


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    if dist.is_available():
        parser.add_argument('--backend', type=str, help='Distributed backend',
                            choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI],
                            default=dist.Backend.GLOO)
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    if use_cuda:
        print('Using CUDA')

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if should_distribute():
        print('Using distributed PyTorch with {} backend'.format(args.backend))
        dist.init_process_group(backend=args.backend)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=False, **kwargs)

    model = Net().to(device)

    if is_distributed():
        Distributor = nn.parallel.DistributedDataParallel if use_cuda \
            else nn.parallel.DistributedDataParallelCPU
        model = Distributor(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == '__main__':
    main()
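
For reference, here is a minimal sketch (not Katib's actual implementation) of how "<name>=<value>" metric lines such as "accuracy=0.9812" could be pulled out of /katib/mnist.log; the regex is an assumption modeled on the metric format used above:

import re

# Hedged illustration: scan the training log for "<name>=<value>" metric lines
# and pick out the objective metric, as the file metrics collector does.
metric_re = re.compile(r'([\w-]+)\s*=\s*(-?\d+(?:\.\d+)?)')

with open('/katib/mnist.log') as f:
    for line in f:
        for name, value in metric_re.findall(line):
            if name == 'accuracy':
                print('objective metric:', name, float(value))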

60 changes: 60 additions & 0 deletions examples/v1alpha3/file-metricscollector-example.yaml
@@ -0,0 +1,60 @@
apiVersion: "kubeflow.org/v1alpha3"
kind: Experiment
metadata:
  namespace: kubeflow
  labels:
    controller-tools.k8s.io: "1.0"
  name: file-metricscollector-example
spec:
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: accuracy
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: "/katib/mnist.log"
        kind: File
    collector:
      kind: File
  algorithm:
    algorithmName: random
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  parameters:
    - name: --lr
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.03"
    - name: --momentum
      parameterType: double
      feasibleSpace:
        min: "0.3"
        max: "0.7"
  trialTemplate:
    goTemplate:
      rawTemplate: |-
        apiVersion: batch/v1
        kind: Job
        metadata:
          name: {{.Trial}}
          namespace: {{.NameSpace}}
        spec:
          template:
            spec:
              containers:
              - name: {{.Trial}}
                image: docker.io/liuhougangxa/pytorch-mnist:1.0
                imagePullPolicy: Always
                command:
                - "python"
                - "/var/mnist.py"
                - "--epochs=1"
                {{- with .HyperParameters}}
                {{- range .}}
                - "{{.Name}}={{.Value}}"
                {{- end}}
                {{- end}}
              restartPolicy: Never
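
For illustration, the Go template above appends each suggested hyperparameter as a "name=value" argument. A hypothetical trial with suggestions --lr=0.02 and --momentum=0.5 would run a command equivalent to this sketch:

# Hypothetical suggestion values; Katib substitutes them via the Go template.
hyperparameters = [("--lr", "0.02"), ("--momentum", "0.5")]
command = ["python", "/var/mnist.py", "--epochs=1"]
command += ["{}={}".format(name, value) for name, value in hyperparameters]
# -> ['python', '/var/mnist.py', '--epochs=1', '--lr=0.02', '--momentum=0.5']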
14 changes: 0 additions & 14 deletions examples/v1alpha3/tfevent-volume/tfevent-pv.yaml

This file was deleted.

14 changes: 0 additions & 14 deletions examples/v1alpha3/tfevent-volume/tfevent-pvc.yaml

This file was deleted.

64 changes: 64 additions & 0 deletions test/scripts/v1alpha3/run-file-metricscollector.sh
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright 2018 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This shell script configures kubectl for the test cluster and runs the
# file metrics collector e2e experiment from our argo workflow

set -o errexit
set -o nounset
set -o pipefail

CLUSTER_NAME="${CLUSTER_NAME}"
ZONE="${GCP_ZONE}"
PROJECT="${GCP_PROJECT}"
NAMESPACE="${DEPLOY_NAMESPACE}"
REGISTRY="${GCP_REGISTRY}"
GO_DIR=${GOPATH}/src/github.com/${REPO_OWNER}/${REPO_NAME}

echo "Activating service-account"
gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

echo "Configuring kubectl"

echo "CLUSTER_NAME: ${CLUSTER_NAME}"
echo "ZONE: ${GCP_ZONE}"
echo "PROJECT: ${GCP_PROJECT}"

gcloud --project ${PROJECT} container clusters get-credentials ${CLUSTER_NAME} \
  --zone ${ZONE}
kubectl config set-context $(kubectl config current-context) --namespace=default
USER=`gcloud config get-value account`

echo "All Katib components are running."
kubectl version
kubectl cluster-info
echo "Katib deployments"
kubectl -n kubeflow get deploy
echo "Katib services"
kubectl -n kubeflow get svc
echo "Katib pods"
kubectl -n kubeflow get pod

cd ${GO_DIR}/test/e2e/v1alpha3

echo "Running e2e file metricscollector experiment"
export KUBECONFIG=$HOME/.kube/config
./run-e2e-experiment ../../../examples/v1alpha3/file-metricscollector-example.yaml
kubectl -n kubeflow describe suggestion
kubectl delete -f ../../../examples/v1alpha3/file-metricscollector-example.yaml
kubectl describe pods
kubectl describe deploy
exit 0
7 changes: 7 additions & 0 deletions test/workflows/components/workflows-v1alpha3.libsonnet
@@ -275,6 +275,10 @@
name: "run-grid-e2e-tests",
template: "run-grid-e2e-tests",
},
{
name: "run-file-metricscollector-e2e-tests",
template: "run-file-metricscollector-e2e-tests",
},
{
name: "run-bayesian-e2e-tests",
template: "run-bayesian-e2e-tests",
@@ -355,6 +359,9 @@
$.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("run-bayesian-e2e-tests", testWorkerImage, [
"test/scripts/v1alpha3/run-suggestion-bayesian.sh",
]), // run bayesian algorithm
$.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("run-file-metricscollector-e2e-tests", testWorkerImage, [
"test/scripts/v1alpha3/run-file-metricscollector.sh",
]), // run file metrics collector test
$.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", testWorkerImage, [
"python",
"-m",