# Source: microsoft/waza — .github/workflows/skills-ci-example.yml (main branch)
name: Skills CI Example - Waza Evaluation

# This is a template workflow for microsoft/skills repositories.
# Copy this file to your skill repository's .github/workflows/ directory
# and customize it for your skill.
#
# This example shows how to:
# 1. Install waza from source
# 2. Run evaluations with the mock executor (no API keys needed)
# 3. Upload results as artifacts
# 4. Use exit codes for CI pass/fail
#
# NOTE: the pull_request path filter lists this file by name
# ('.github/workflows/skills-ci-example.yml'). If you rename the file when
# copying it, update that entry so edits to the workflow still trigger a run.

on:
  # Pull requests targeting main that touch the skill, its evals,
  # or this workflow file
  pull_request:
    branches:
      - main
    paths:
      - 'SKILL.md'
      - 'eval/**'
      - '.github/workflows/skills-ci-example.yml'

  # Direct pushes to main that touch the skill or its evals
  push:
    branches:
      - main
    paths:
      - 'SKILL.md'
      - 'eval/**'

  # Manual runs, optionally pointing at a different eval file
  workflow_dispatch:
    inputs:
      eval-yaml:
        description: 'Path to evaluation YAML file'
        type: string
        required: false
        default: 'eval/eval.yaml'

# Grant the workflow's GITHUB_TOKEN only read access to repository contents.
permissions:
  contents: read

jobs:
  # Single job: installs waza, runs the evaluation, and publishes the results.
  evaluate-skill:
    name: Evaluate Skill with Waza
    runs-on: ubuntu-latest

    # Steps run in order. Once a step fails, later steps are skipped unless
    # their `if:` uses a status-check function such as always().
    steps:
      # Check out this repository so later steps can read the eval files.
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Setup Go Environment
        uses: actions/setup-go@v5
        with:
          # Waza requires Go 1.26+
          go-version: '1.26'
          # setup-go caches dependencies by default and keys the cache on
          # go.sum. A skill repository normally has no Go module of its own,
          # so the cache can never be restored and every run only logs a
          # "Dependencies file is not found" warning. Turn caching off.
          cache: false

      # Option 1: Install from source (recommended for CI)
      # `@latest` picks up the most recent waza version available at run time;
      # replace it with a version tag if you need reproducible runs.
      # The trailing `waza --version` call fails the step early if the binary
      # is not on PATH and records the installed version in the job log.
      - name: Install Waza from Source
        run: |
          go install github.com/microsoft/waza/cmd/waza@latest
          waza --version

      # Option 2: Build from Dockerfile (alternative)
      # Uncomment this block if you prefer Docker-based builds
      # - name: Build Waza Docker Image
      #   run: |
      #     docker build -t waza:local .
      #     docker run waza:local --version

      # Resolves which eval YAML to run and exposes it as the `eval-file`
      # step output; fails the job early if the file does not exist.
      - name: Determine Eval File
        id: eval-file
        env:
          # Pass the dispatch input through the environment instead of
          # expanding the expression inside the script body, so the value is
          # always treated as data and never parsed as shell code.
          EVAL_YAML_INPUT: ${{ inputs.eval-yaml }}
        run: |
          # Use workflow input if provided, otherwise the default location
          # for skill evals (the input is empty on pull_request/push events)
          EVAL_FILE="${EVAL_YAML_INPUT:-eval/eval.yaml}"

          # Verify file exists
          if [ ! -f "$EVAL_FILE" ]; then
            echo "::error::Evaluation file not found: $EVAL_FILE"
            echo "Expected structure:"
            echo "  your-skill/"
            echo "  ├── SKILL.md"
            echo "  └── eval/"
            echo "      ├── eval.yaml"
            echo "      ├── tasks/"
            echo "      └── fixtures/"
            exit 1
          fi

          echo "eval-file=$EVAL_FILE" >> "$GITHUB_OUTPUT"
          echo "Using eval file: $EVAL_FILE"

      # Runs the evaluation; a non-zero exit code from waza fails this step
      # and therefore the job.
      - name: Run Waza Evaluation
        id: run-eval
        env:
          # Hand the resolved path to the script via the environment rather
          # than expanding the expression inside the shell source; the value
          # ultimately derives from a user-supplied workflow input.
          EVAL_FILE: ${{ steps.eval-file.outputs.eval-file }}
        run: |
          # Run waza with mock executor (no API keys needed)
          # The mock executor simulates agent behavior for testing
          # Exit codes: 0=success, 1=test failure, 2=config error
          waza run "$EVAL_FILE" \
            --verbose \
            --output results.json

          # The workflow will fail if tests fail (exit code 1)
          # or if there's a configuration error (exit code 2)

      # Publish whatever the evaluation produced, even if an earlier step
      # failed; a missing file only produces a warning.
      - name: Upload Results Artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: waza-evaluation-results
          if-no-files-found: warn
          retention-days: 30
          path: |
            results.json
            transcripts/

      # Appends the first lines of results.json to the job summary as a
      # fenced JSON block. Runs regardless of earlier failures.
      - name: Display Results Summary
        if: always()
        run: |
          # Nothing to summarize if the evaluation never wrote a results file
          if [ -f results.json ]; then
            {
              echo "## Evaluation Results"
              echo '```json'
              # Show at most the first 50 lines of the results
              head -n 50 results.json
              # Make sure the closing fence starts on its own line even when
              # the file does not end with a newline
              echo
              echo '```'
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      # An `if:` without a status-check function gets an implicit success(),
      # so without failure() this step would be skipped exactly when the
      # evaluation fails. failure() lets it run after a failed step and emit
      # the error annotation.
      - name: Check Evaluation Status
        if: failure() && steps.run-eval.outcome == 'failure'
        run: |
          echo "::error::Waza evaluation failed. Check the results artifact for details."
          exit 1

  # Optional: Test with Copilot SDK executor (requires GITHUB_TOKEN)
  # Uncomment this job if you want to test with actual AI models
  # evaluate-with-copilot:
  #   name: Evaluate with Copilot SDK
  #   runs-on: ubuntu-latest
  #   if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  #
  #   steps:
  #     - name: Checkout Repository
  #       uses: actions/checkout@v4
  #
  #     - name: Setup Go Environment
  #       uses: actions/setup-go@v5
  #       with:
  #         go-version: '1.26'
  #
  #     - name: Install Waza
  #       run: |
  #         go install github.com/microsoft/waza/cmd/waza@latest
  #         waza --version
  #
  #     - name: Run Evaluation with Copilot
  #       env:
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  #       run: |
  #         # Update eval.yaml to use copilot-sdk executor
  #         # Then run evaluation
  #         waza run eval/eval.yaml --verbose --output results-copilot.json
  #
  #     - name: Upload Copilot Results
  #       if: always()
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: waza-copilot-results
  #         path: results-copilot.json