Distributed

Distributed training is the ability to split the training of a model among multiple processors. It is often a necessity when single-node multi-GPU training no longer suffices, typically because you require more GPUs than exist on a single node. Each such split is a pod (see the definition above). NVIDIA Run:ai spawns an additional launcher process that manages and coordinates the other worker pods. For more information, see Distributed training.

Create a distributed training.

post

Use to create a distributed training.

Authorizations
AuthorizationstringRequired

Bearer authentication

Body
Responses
post
/api/v1/workloads/distributed
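As a sketch of how this endpoint might be called from Python's standard library (the base URL, token, and body fields below are placeholders, not the full request schema):

```python
import json
import urllib.request

BASE_URL = "https://my-cluster.example.com"  # placeholder control-plane URL


def create_distributed_training(token: str, body: dict) -> urllib.request.Request:
    """Build the POST request for /api/v1/workloads/distributed."""
    return urllib.request.Request(
        url=f"{BASE_URL}/api/v1/workloads/distributed",
        data=json.dumps(body).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {token}",  # Bearer authentication, as above
            "Content-Type": "application/json",
        },
        method="POST",
    )


# The body fields here are illustrative only; consult the full request schema.
req = create_distributed_training("MY_TOKEN", {"name": "train-llm", "projectId": "proj-1"})
```

Sending the request is then a matter of passing it to `urllib.request.urlopen` (omitted here since it requires a live cluster).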

Get a distributed training's data. [Experimental]

get

Retrieve the details of a distributed training by workload id.

Authorizations
AuthorizationstringRequired

Bearer authentication

Path parameters
workloadIdstring · uuidRequired

The Universally Unique Identifier (UUID) of the workload.

Responses
200

Executed successfully.

application/json
get
/api/v1/workloads/distributed/{workloadId}
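A minimal sketch of the retrieval call, again using only the standard library (the base URL and token are placeholders; only the path and auth scheme come from this page):

```python
import json
import urllib.request

BASE_URL = "https://my-cluster.example.com"  # placeholder control-plane URL


def get_distributed_training(token: str, workload_id: str) -> urllib.request.Request:
    """Build the GET request for a single distributed training by workload UUID."""
    return urllib.request.Request(
        url=f"{BASE_URL}/api/v1/workloads/distributed/{workload_id}",
        headers={"Authorization": f"Bearer {token}"},
        method="GET",
    )


# A 200 response carries an application/json body with the workload details:
# with urllib.request.urlopen(get_distributed_training(token, wid)) as resp:
#     details = json.load(resp)
```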

Delete a distributed training by id.

delete

Use to delete a distributed training by workload id.

Authorizations
AuthorizationstringRequired

Bearer authentication

Path parameters
workloadIdstring · uuidRequired

The Universally Unique Identifier (UUID) of the workload.

Responses
delete
/api/v1/workloads/distributed/{workloadId}

Suspend a distributed training.

post

Suspend a distributed training from running using a workload id.

Authorizations
AuthorizationstringRequired

Bearer authentication

Path parameters
workloadIdstring · uuidRequired

The Universally Unique Identifier (UUID) of the workload.

Responses
post
/api/v1/workloads/distributed/{workloadId}/suspend

Resume a distributed training.

post

Resume a distributed training that was suspended using a workload id.

Authorizations
AuthorizationstringRequired

Bearer authentication

Path parameters
workloadIdstring · uuidRequired

The Universally Unique Identifier (UUID) of the workload.

Responses
post
/api/v1/workloads/distributed/{workloadId}/resume
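The suspend and resume endpoints share the same shape (a POST to a sub-resource of the workload with no body), so a single hypothetical helper can cover both; the base URL and token are placeholders:

```python
import urllib.request

BASE_URL = "https://my-cluster.example.com"  # placeholder control-plane URL


def workload_action(token: str, workload_id: str, action: str) -> urllib.request.Request:
    """Build a POST for the suspend/resume sub-resource of a distributed training."""
    if action not in ("suspend", "resume"):
        raise ValueError(f"unsupported action: {action}")
    return urllib.request.Request(
        url=f"{BASE_URL}/api/v1/workloads/distributed/{workload_id}/{action}",
        headers={"Authorization": f"Bearer {token}"},
        method="POST",
    )


# Suspend, then later resume, the same workload by its UUID:
suspend_req = workload_action("MY_TOKEN", "00000000-0000-0000-0000-000000000000", "suspend")
resume_req = workload_action("MY_TOKEN", "00000000-0000-0000-0000-000000000000", "resume")
```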
