# ib-test-b200.yaml

---
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-test
spec:
  # Both the launcher and the worker containers run as UID 0, so the SSH
  # credentials are mounted into root's home directory.
  sshAuthMountPath: /root/.ssh

  # 8 MPI slots per worker pod, matching the 8 GPUs each worker requests.
  slotsPerWorker: 8
  launcherCreationPolicy: WaitForWorkersReady
  runPolicy:
    cleanPodPolicy: Running

  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: mpi-launcher
              image: docker.io/deepops/nccl-tests:2312
              # The single argument below is a folded scalar: its line breaks
              # become spaces, so bash receives one long mpirun command line.
              # 16 ranks = 2 workers x 8 slots, each rank driving 1 GPU (-g 1).
              # SSH to the workers targets port 2222, the port their sshd
              # listens on, using the key mounted under /root/.ssh.
              command:
                - /bin/bash
                - "-lc"
              args:
                - >
                  mpirun --allow-run-as-root -np 16
                  -bind-to none
                  -map-by slot
                  -mca pml ob1 -mca btl ^openib
                  -mca btl_tcp_if_include 192.168.0.0/16
                  -mca oob_tcp_if_include 172.29.0.0/16
                  -mca routed direct
                  -mca plm_rsh_args
                  "-p 2222
                  -o StrictHostKeyChecking=no
                  -o UserKnownHostsFile=/dev/null
                  -i /root/.ssh/id_rsa"
                  all_reduce_perf_mpi -b 16 -e 16G -f 2 -g 1
              env:
                # Open MPI's explicit opt-in for running as root.
                - name: OMPI_ALLOW_RUN_AS_ROOT
                  value: "1"
                - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
                  value: "1"
              # UID 0 sidesteps the "uid 1000" user lookup error.
              securityContext:
                runAsUser: 0

    # Worker pods: each one runs sshd on port 2222 so the launcher's mpirun can
    # start ranks over SSH, and requests 8 GPUs plus one SR-IOV InfiniBand
    # resource per network (a through h).
    Worker:
      replicas: 2
      template:
        metadata:
          labels:
            mpi-memcpy-dra-test-replica: mpi-worker
          annotations:
            # Secondary SR-IOV RDMA networks attached to the pod, one for each
            # sriovib_resource_* requested in the limits below.
            k8s.v1.cni.cncf.io/networks: "sriovibnet-rdma-default-a-su-1,sriovibnet-rdma-default-b-su-1,sriovibnet-rdma-default-c-su-1,sriovibnet-rdma-default-d-su-1,sriovibnet-rdma-default-e-su-1,sriovibnet-rdma-default-f-su-1,sriovibnet-rdma-default-g-su-1,sriovibnet-rdma-default-h-su-1"
        spec:
          restartPolicy: OnFailure
          containers:
            - name: mpi-worker
              image: docker.io/deepops/nccl-tests:2312
              # Root to generate host keys & read /etc/ssh/*.
              # IPC_LOCK allows locking (pinning) memory, which RDMA stacks
              # typically need for memory registration.
              securityContext:
                runAsUser: 0
                capabilities:
                  add: ["IPC_LOCK"]
              command: ["/bin/bash","-lc"]
              # Generate any missing SSH host keys, then replace the shell
              # with sshd in the foreground (-D), logging to stderr (-e).
              args:
                - >
                  ssh-keygen -A &&
                  exec /usr/sbin/sshd -De -p 2222
              resources:
                limits:
                  nvidia.com/gpu: 8
                  nvidia.com/sriovib_resource_a: "1"
                  nvidia.com/sriovib_resource_b: "1"
                  nvidia.com/sriovib_resource_c: "1"
                  nvidia.com/sriovib_resource_d: "1"
                  nvidia.com/sriovib_resource_e: "1"
                  nvidia.com/sriovib_resource_f: "1"
                  nvidia.com/sriovib_resource_g: "1"
                  nvidia.com/sriovib_resource_h: "1"
              volumeMounts:
                # Eight NCCL ranks share this pod; give them a memory-backed
                # /dev/shm instead of the container runtime's small default,
                # which NCCL's shared-memory IPC can exhaust.
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            - name: dshm
              emptyDir:
                medium: Memory

# Last updated