# Nightly Tests
# Nightly Tests #863

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Nightly Tests

# Triggers: a daily cron plus a manual run from the Actions UI.
on:
  schedule:
    # 08:00 UTC every day, i.e. 12AM PST.
    - cron: "0 8 * * *"
  workflow_dispatch:

# The workflow token only needs to read repository contents.
permissions:
  contents: read

# Network flags shared by the jobs that pass them as custom cluster arguments.
env:
  CLUSTER_NETWORK_ARGUMENTS: '--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}}'

jobs:
# End-to-end nightly check of the xpk CLI: creates and deletes an empty
# cluster, a private cluster and a 2-slice v4-8 TPU cluster, and on the TPU
# cluster exercises workload, inspector, info, batch/job and shell commands.
cluster-create-and-delete:
  runs-on: [ubuntu-22.04]
  # We support one build test to run at a time currently: the cluster names
  # below are fixed, so runs are queued rather than cancelled.
  concurrency:
    group: nightly-test-cluster-group
    cancel-in-progress: false
  env:
    EMPTY_CLUSTER_NAME: nightly-xpk-zero-nodepools
    PRIVATE_CLUSTER_NAME: nightly-xpk-private-2-v4-8-nodepools
    TPU_CLUSTER_NAME: nightly-xpk-2-v4-8-nodepools
    WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
  steps:
    # --- Toolchain and authentication -------------------------------------
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.10'
    # `expect` drives the interactive `xpk shell` session further down.
    # -y keeps apt-get from waiting on a confirmation prompt.
    - name: Install expect package
      run: sudo apt-get install -y expect
    - uses: 'google-github-actions/auth@v2'
      with:
        credentials_json: '${{ secrets.GCP_SA_KEY }}'
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: 'beta,gke-gcloud-auth-plugin'
    - name: Verify gcp setup
      run: gcloud info
    # The gcloud default zone is deliberately different from the
    # us-central2-b zone every xpk command passes explicitly.
    - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
      run: |
        gcloud config set compute/zone us-east4-a
        gcloud config get compute/zone
    - name: Install xpk dependencies
      run: |
        make install
        echo $PWD/bin >> "$GITHUB_PATH"
    - name: Check xpk installation
      run: xpk --help

    # --- Cluster with zero node pools -------------------------------------
    - name: Create an XPK Cluster with zero node pools
      run: python xpk.py cluster create --cluster $EMPTY_CLUSTER_NAME --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
    # always() so the cluster is cleaned up even when an earlier step failed.
    - name: Delete the cluster created
      run: python xpk.py cluster delete --cluster $EMPTY_CLUSTER_NAME --zone=us-central2-b --force
      if: always()

    # --- Private cluster ---------------------------------------------------
    # CLUSTER_NETWORK_ARGUMENTS is a shell environment variable (workflow-level
    # env), so it must be double-quoted: inside single quotes the step's shell
    # would pass the literal text `${CLUSTER_NETWORK_ARGUMENTS}` to xpk.
    - name: Create a Private XPK Cluster with zero node pools
      run: python xpk.py cluster create --cluster $PRIVATE_CLUSTER_NAME --private --tpu-type=v4-8 --num-slices=0 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"
    # Fails the step unless gcloud reports enablePrivateNodes as True.
    - name: Verify the created cluster is private
      run: gcloud container clusters describe $PRIVATE_CLUSTER_NAME --region=us-central2 --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
    - name: Delete the cluster created
      run: python xpk.py cluster delete --cluster $PRIVATE_CLUSTER_NAME --zone=us-central2-b --force
      if: always()

    # --- TPU cluster with 2 x v4-8 node pools ------------------------------
    - name: Create an XPK Cluster with 2x v4-8 nodepools
      run: python xpk.py cluster create --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
    - name: Authenticate Docker
      run: gcloud auth configure-docker --quiet

    # Workload lifecycle: create, list, inspect, wait, info, delete.
    - name: Create test script to execute in workloads
      run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
    - name: Run a base-docker-image workload
      run: python xpk.py workload create --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b
    - name: List out the workloads on the cluster
      run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
    - name: Run xpk inspector with the workload created above
      run: python3 xpk.py inspector --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --workload $WORKLOAD_NAME
    - name: Wait for workload completion and confirm it succeeded
      run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300
    - name: Run xpk info command
      run: python3 xpk.py info --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
    - name: Delete the workload on the cluster
      run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b

    # Batch/job lifecycle. The script carries an unrecognised #SBATCH flag and
    # is submitted with --ignore-unknown-flags.
    - name: Create test script to execute in batch
      run: echo -e '#!/bin/bash \n#SBATCH --unknown-flag=value\n echo "Hello world from a test script!"' > batch.sh
    - name: Run a batch job on the cluster
      run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b batch.sh --ignore-unknown-flags --array 1-5 --nodes 2 --ntasks 3
    - name: List out the jobs on the cluster
      run: python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-'
    # Exports the first matching job name to later steps through GITHUB_ENV.
    - name: Get created job name
      run: |
        JOB_NAME=$(python3 xpk.py job ls --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep 'xpk-def-app-profile-slurm-' | head -1 | awk '{print $1}')
        echo "JOB_NAME=${JOB_NAME}" >> $GITHUB_ENV
    # Expects completions=2, parallelism=2 and 3 containers in the job spec.
    - name: Check job spec
      run: |
        job_spec=$(kubectl get job ${JOB_NAME} -o jsonpath='{.spec}')
        echo "$job_spec" | grep '"completions":2'
        echo "$job_spec" | grep '"parallelism":2'
        echo "$job_spec" | jq '.template.spec.containers | length' | grep 3
    # Passes when the seven expected section headings are found in the output.
    - name: Get job info for the last job created on the cluster
      run: python3 xpk.py job info ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep -e "Entrypoint environment variables template:" -e "Job name:" -e "Labels:" -e "Mounts:" -e "Pods:" -e "Profile:" -e "Script name:" | wc -l | grep "7"
    - name: Cancel the batch job on the cluster
      run: python3 xpk.py job cancel ${JOB_NAME} --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | grep "job.batch/${JOB_NAME} deleted"

    # Interactive shell: an expect script opens `xpk shell`, waits up to 180s
    # for the "/ # " prompt, exits, and leaves the session output in shell.log.
    # The heredoc is unquoted, so $TPU_CLUSTER_NAME is expanded while the
    # script file is written.
    - name: Create shell and exit it immediately
      run: |
        cat <<EOF > create-shell.exp
        #!/usr/bin/expect
        set timeout 180
        spawn sh -c "python3 xpk.py shell --cluster $TPU_CLUSTER_NAME --zone=us-central2-b | tee shell.log"
        send "\n"
        expect {
          "/ # " {
            send "exit\n"
            # Wait for EOF after exit
            expect eof
            exit 0
          }
          timeout {
            puts "Timed out waiting for pod to be running"
            exit 1
          }
          eof {
            puts "Unexpected EOF before getting prompt"
            exit 1
          }
        }
        EOF
        chmod +x ./create-shell.exp
        expect ./create-shell.exp
    # Pulls the quoted pod name out of the 'waiting for pod' line in shell.log
    # and waits up to one minute for that pod to report Ready.
    - name: Check if shell exists and is running
      run: |
        pod_name=$(grep 'waiting for pod' shell.log | awk -F'"' '{print $2}')
        kubectl wait --for='jsonpath={.status.conditions[?(@.type=="Ready")].status}=True' --timeout=1m pod/${pod_name}
    - name: Stop the shell
      run: python3 xpk.py shell stop --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
    - name: Delete create-shell.exp file
      run: rm create-shell.exp
    - name: Delete shell.log file
      run: rm shell.log

    # Final cleanup, again unconditional.
    - name: Delete the cluster created
      if: always()
      run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
# Nightly check of the Pathways flow: create a Pathways-enabled cluster, run
# one Pathways workload to completion, then tear everything down.
pw-cluster-and-workload:
  runs-on: ubuntu-22.04
  env:
    TPU_CLUSTER_NAME: pw-nightly-test-2-v4-8-nodepools
    WORKLOAD_NAME: xpkpw-nightly-${{ github.run_attempt }}
  # We support one build test to run at a time currently.
  concurrency:
    group: nightly-pw-test-cluster-group
    cancel-in-progress: false
  steps:
    - uses: actions/checkout@v4

    - uses: actions/setup-python@v5
      with:
        python-version: "3.10"

    - uses: google-github-actions/auth@v2
      with:
        credentials_json: ${{ secrets.GCP_SA_KEY }}

    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: ">= 363.0.0"
        install_components: beta,gke-gcloud-auth-plugin

    # The configured default zone differs on purpose from the zone given via
    # --zone in the xpk commands below.
    - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
      run: |
        gcloud config set compute/zone us-east4-a
        gcloud config get compute/zone

    # Builds xpk and puts ./bin on PATH for the following steps.
    - name: Install xpk dependencies
      run: |
        make install
        echo $PWD/bin >> "$GITHUB_PATH"

    - name: Check xpk installation
      run: xpk --help

    - name: Create an Pathways-enabled XPK Cluster with 2 x v4-8 nodepools
      run: python xpk.py cluster create-pathways --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_NETWORK_ARGUMENTS}"

    - name: Create test script to execute in workloads
      run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh

    - name: Run a Pathways workload on Ubuntu base image
      run: python xpk.py workload create-pathways --cluster $TPU_CLUSTER_NAME --workload $WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --command "echo \"Hello world from a test script! \""

    - name: Wait for Pathways workload completion and confirm it succeeded
      run: python3 xpk.py workload list --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --wait-for-job-completion $WORKLOAD_NAME --timeout 300

    - name: Delete the Pathways workload on the cluster
      run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b

    # Unconditional so a failed run does not leave the cluster behind.
    - name: Delete the Pathways cluster created
      run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
      if: always()
# Nightly check that a RayCluster-enabled XPK cluster can be created and
# deleted.
rc-cluster:
  runs-on: [ubuntu-22.04]
  # We support one build test to run at a time currently.
  concurrency:
    group: nightly-rc-test-cluster-group
    cancel-in-progress: false
  env:
    TPU_CLUSTER_NAME: rc-nightly-test-2-v4-8-nodepools
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        python-version: '3.10'
    - uses: 'google-github-actions/auth@v2'
      with:
        credentials_json: '${{ secrets.GCP_SA_KEY }}'
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: 'beta,gke-gcloud-auth-plugin'
    # The gcloud default zone is deliberately not the zone passed via --zone.
    - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
      run: |
        gcloud config set compute/zone us-east4-a
        gcloud config get compute/zone
    - name: Install xpk dependencies
      run: |
        make install
        echo $PWD/bin >> "$GITHUB_PATH"
    - name: Check xpk installation
      run: xpk --help
    - name: Create a RayCluster-enabled XPK Cluster with 2 x v4-8 nodepools
      run: python xpk.py cluster create-ray --cluster $TPU_CLUSTER_NAME --tpu-type=v4-8 --num-slices=2 --zone=us-central2-b --ray-version=2.39.0 --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments='${{ secrets.CLUSTER_ARGUMENTS }}'
    # Runs even after a failed create. --force is passed to match every other
    # `cluster delete` call in this workflow.
    # NOTE(review): presumably --force skips xpk's interactive confirmation,
    # which would matter on a non-interactive runner — confirm against the CLI.
    - name: Delete the RayCluster-enabled XPK cluster
      if: always()
      run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b --force
# Computes the names and locations shared by the reusable-workflow jobs and
# publishes them as job outputs (read elsewhere as
# needs.set-variables.outputs.<name>).
set-variables:
  runs-on: [ubuntu-22.04]
  # NOTE(review): this workflow is triggered only by schedule and
  # workflow_dispatch; github.event.number looks like a pull-request field, so
  # the group name may always resolve to "set-variables-" — confirm intent.
  concurrency:
    group: set-variables-${{ github.event.number}}
    cancel-in-progress: true
  outputs:
    cluster-name: ${{ steps.set-cluster-name.outputs.cluster-name }}
    cluster-name-dws: ${{ steps.set-cluster-name-dws.outputs.cluster-name-dws }}
    group-name: ${{ steps.set-group-name.outputs.group-name }}
    zone: ${{ steps.set-zone.outputs.zone }}
    tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }}
    tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }}
    location: ${{steps.set-location.outputs.location}}
    run-id: ${{steps.set-run-id.outputs.run-id}}
  steps:
    # run-id is "dispatch" for manual runs, the branch name for main/develop,
    # and otherwise the event number. The github.* values are handed over as
    # environment variables instead of being pasted into the script text, so
    # their content can never be interpreted as shell syntax.
    - name: set run-id
      id: set-run-id
      env:
        EVENT_NAME: ${{ github.event_name }}
        GIT_REF: ${{ github.ref }}
        EVENT_NUMBER: ${{ github.event.number }}
      run: |
        if [ "$EVENT_NAME" == "workflow_dispatch" ]; then
          RUN_ID="dispatch"
        elif [ "$GIT_REF" == "refs/heads/main" ]; then
          RUN_ID="main"
        elif [ "$GIT_REF" == "refs/heads/develop" ]; then
          RUN_ID="develop"
        else
          RUN_ID="$EVENT_NUMBER"
        fi
        echo "run-id=$RUN_ID" >> "$GITHUB_OUTPUT"
    # The three names below are all suffixed with the run-id computed above.
    - name: set cluster-name
      id: set-cluster-name
      run: |
        echo "cluster-name=build-xpk-2-nodepools-${{steps.set-run-id.outputs.run-id}}" >> "$GITHUB_OUTPUT"
    - name: set cluster-name-dws
      id: set-cluster-name-dws
      run: |
        echo "cluster-name-dws=build-xpk-2-nodepools-dws-${{steps.set-run-id.outputs.run-id}}" >> "$GITHUB_OUTPUT"
    - name: set group-name
      id: set-group-name
      run: |
        echo "group-name=xpk-${{steps.set-run-id.outputs.run-id}}" >> "$GITHUB_OUTPUT"
    # Fixed values: zone, TPU type (plain and topology form) and location.
    - name: set zone
      id: set-zone
      run: |
        echo "zone=us-central2-b" >> "$GITHUB_OUTPUT"
    - name: set tpu-type
      id: set-tpu-type
      run: |
        echo "tpu-type=v4-8" >> "$GITHUB_OUTPUT"
    - name: set tpu-type-topology
      id: set-tpu-type-topology
      run: |
        echo "tpu-type-topology=v4-2x2x1" >> "$GITHUB_OUTPUT"
    - name: set location
      id: set-location
      run: |
        echo "location=us-central2" >> "$GITHUB_OUTPUT"
# Installs the development dependencies once per Python version and saves
# them to the Actions cache under a key unique to this run and attempt.
install-dependencies:
  needs: [set-variables]
  runs-on: ubuntu-22.04
  strategy:
    matrix:
      python-version: ["3.10", "3.11"]
  steps:
    - uses: actions/checkout@v4
    - uses: google-github-actions/setup-gcloud@v2
      with:
        version: '>= 363.0.0'
        install_components: 'beta,gke-gcloud-auth-plugin'
    - uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    # Lookup only: reports whether an entry exists without downloading it.
    # The path list is kept identical to the one in "Cache dependencies"
    # below, so that the lookup refers to the same entry the save creates
    # (the original listed a relative `usr/local/bin/` here).
    - name: Check if cache exists
      id: check-cache
      uses: actions/cache@v3
      with:
        path: |
          /usr/local/bin/kubectl-kueue
          /usr/local/bin/kubectl-kjob
          ~/.cache/pip
          ${{env.pythonLocation}}
        key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
        lookup-only: true
    # Installs dev dependencies and copies the two kubectl plugins built into
    # ./bin to /usr/local/bin, where the cache step picks them up.
    - name: install dependencies
      if: steps.check-cache.outputs.cache-hit != 'true'
      run: make install-dev && cp ./bin/kubectl-kueue /usr/local/bin/kubectl-kueue && cp ./bin/kubectl-kjob /usr/local/bin/kubectl-kjob
    - name: Cache dependencies
      if: steps.check-cache.outputs.cache-hit != 'true'
      uses: actions/cache/save@v3
      with:
        path: |
          /usr/local/bin/kubectl-kueue
          /usr/local/bin/kubectl-kjob
          ~/.cache/pip
          ${{env.pythonLocation}}
        key: xpk-deps-${{ matrix.python-version }}-${{github.run_id}}-${{github.run_attempt}}
# Calls the reusable integration-test workflow once dependencies are cached.
run-integration-tests:
  needs:
    - install-dependencies
    - set-variables
  uses: ./.github/workflows/reusable_integration_tests.yaml
  with:
    run-id: ${{needs.set-variables.outputs.run-id}}
  secrets: inherit
  # We support one build or nightly test to run at a time currently.
  concurrency:
    group: integration-tests-${{needs.set-variables.outputs.run-id}}
    cancel-in-progress: true
# Calls the reusable private-cluster workflow after the integration tests.
cluster-private:
  needs:
    - run-integration-tests
    - set-variables
  uses: ./.github/workflows/reusable_cluster_private.yaml
  with:
    run-id: ${{needs.set-variables.outputs.run-id}}
    cluster-name: ${{needs.set-variables.outputs.cluster-name}}
    # Uses the topology form of the TPU type, falling back to inputs.tpu-type.
    tpu-type: ${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}
    zone: ${{needs.set-variables.outputs.zone}}
    location: ${{needs.set-variables.outputs.location}}
  secrets: inherit
  # We support one build or nightly test to run at a time currently.
  concurrency:
    group: cluster-private-${{needs.set-variables.outputs.run-id}}
    cancel-in-progress: true
# Calls the reusable cluster-creation workflow after the integration tests.
cluster-create:
  needs:
    - run-integration-tests
    - set-variables
  uses: ./.github/workflows/reusable_cluster_create.yaml
  with:
    cluster-name-dws: ${{needs.set-variables.outputs.cluster-name-dws}}
    cluster-name: ${{needs.set-variables.outputs.cluster-name}}
    # Uses the topology form of the TPU type, falling back to inputs.tpu-type.
    tpu-type: ${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}
    zone: ${{needs.set-variables.outputs.zone}}
    location: ${{needs.set-variables.outputs.location}}
    run-id: ${{needs.set-variables.outputs.run-id}}
  secrets: inherit
  # We support one build or nightly test to run at a time currently.
  concurrency:
    group: cluster-create-${{needs.set-variables.outputs.run-id}}
    cancel-in-progress: true
# Calls the reusable workload-test workflow once cluster-create has finished.
workloads-tests:
  needs:
    - cluster-create
    - set-variables
  uses: ./.github/workflows/reusable_workload_tests.yaml
  with:
    cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
    cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
    tpu-type: '${{needs.set-variables.outputs.tpu-type}}'
    tpu-type-topology: '${{needs.set-variables.outputs.tpu-type-topology}}'
    zone: '${{needs.set-variables.outputs.zone}}'
    run-id: '${{needs.set-variables.outputs.run-id}}'
  secrets: inherit
  # We support one build or nightly test to run at a time currently.
  concurrency:
    group: workload-tests-${{needs.set-variables.outputs.run-id}}
    cancel-in-progress: true
# Calls the reusable batch-test workflow once cluster-create has finished.
batch-tests:
  needs:
    - cluster-create
    - set-variables
  uses: ./.github/workflows/reusable_batch_tests.yaml
  with:
    cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
    zone: '${{needs.set-variables.outputs.zone}}'
    run-id: '${{needs.set-variables.outputs.run-id}}'
  secrets: inherit
  # We support one build or nightly test to run at a time currently.
  concurrency:
    group: batch-tests-${{needs.set-variables.outputs.run-id}}
    cancel-in-progress: true
# Calls the reusable storage-test workflow; it waits for both the batch and
# the workload test jobs before starting.
storage-tests:
  needs:
    - cluster-create
    - set-variables
    - batch-tests
    - workloads-tests
  uses: ./.github/workflows/reusable_storage_tests.yaml
  with:
    cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
    tpu-type: '${{needs.set-variables.outputs.tpu-type}}'
    zone: '${{needs.set-variables.outputs.zone}}'
    run-id: '${{needs.set-variables.outputs.run-id}}'
  secrets: inherit
  # We support one build or nightly test to run at a time currently.
  concurrency:
    group: storage-tests-${{needs.set-variables.outputs.run-id}}
    cancel-in-progress: true
# Calls the reusable cluster-deletion workflow. always() makes it run whether
# or not the jobs it depends on succeeded.
cluster-delete:
  needs:
    - set-variables
    - storage-tests
  if: always()
  uses: ./.github/workflows/reusable_cluster_delete.yaml
  with:
    cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
    cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
    run-id: '${{needs.set-variables.outputs.run-id}}'
    zone: '${{needs.set-variables.outputs.zone}}'
  secrets: inherit