Merge pull request #11 from All-Hands-AI/main

update
EcoSphereNetwork · Dec 21, 2024 · ed75313 · ed75313
2 parents e9134ff + 252c709
commit ed75313
Show file tree

Hide file tree

Showing 724 changed files with 28,520 additions and 39,981 deletions.
diff --git a/.devcontainer/README.MD b/.devcontainer/README.MD
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
diff --git a/.devcontainer/on_create.sh b/.devcontainer/on_create.sh
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -16,6 +16,9 @@ updates:
       chromadb:
         patterns:
           - "chromadb"
+      browsergym:
+        patterns:
+          - "browsergym*"
       security-all:
         applies-to: "security-updates"
         patterns:

diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml
@@ -1,10 +1,8 @@
-name: Run Evaluation
+name: Run SWE-Bench Evaluation
 
 on:
   pull_request:
     types: [labeled]
-  schedule:
-    - cron: "0 1 * * *" # Run daily at 1 AM UTC
   workflow_dispatch:
     inputs:
       reason:
@@ -60,24 +58,6 @@ jobs:
           echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
           echo "temperature = 0.0" >> config.toml
 
-      - name: Run integration test evaluation
-        env:
-          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
-          RUNTIME: remote
-          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
-          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
-
-        run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
-
-          # get evaluation report
-          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
-          echo "REPORT_FILE: $REPORT_FILE"
-          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
-          cat $REPORT_FILE >> $GITHUB_ENV
-          echo >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
       - name: Run SWE-Bench evaluation
         env:
           ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
@@ -86,12 +66,12 @@ jobs:
           EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
 
         run: |
-          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
+          poetry run ./evaluation/benchmarks/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
           OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
           echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
-          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
+          poetry run ./evaluation/benchmarks/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
 
-          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
+          poetry run ./evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
           echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
           cat summarize_outputs.log >> $GITHUB_ENV
           echo "EOF" >> $GITHUB_ENV
@@ -145,9 +125,6 @@ jobs:
               **SWE-Bench Evaluation Report**
               ${{ env.SWEBENCH_REPORT }}
               ---
-              **Integration Tests Evaluation Report**
-              ${{ env.INTEGRATION_TEST_REPORT }}
-              ---
               You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).
 
       - name: Post to a Slack channel

diff --git a/.github/workflows/fe-unit-tests.yml b/.github/workflows/fe-unit-tests.yml
@@ -24,7 +24,8 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        node-version: [20]
+        node-version: [20, 22]
+      fail-fast: true
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -35,6 +36,9 @@ jobs:
       - name: Install dependencies
         working-directory: ./frontend
         run: npm ci
+      - name: Run TypeScript compilation
+        working-directory: ./frontend
+        run: npm run make-i18n && tsc
       - name: Run tests and collect coverage
         working-directory: ./frontend
         run: npm run test:coverage

diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml
@@ -291,7 +291,7 @@ jobs:
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
           TEST_IN_CI=true \
           RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -368,7 +368,7 @@ jobs:
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
           TEST_IN_CI=true \
           RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:

diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml
@@ -0,0 +1,158 @@
+name: "Run Integration Tests"
+
+on:
+  schedule:
+    - cron: "30 22 * * *"  # every day at 22:30 UTC
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: "Reason for manual trigger"
+        required: true
+        default: ""
+  pull_request:
+    types: [labeled]
+
+env:
+  N_PROCESSES: 10  # workflow-wide number of parallel processes handed to each evaluation run
+
+jobs:
+  run-integration-tests:  # gated on: nightly schedule, manual dispatch, or the 'integration-test' label
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event.label.name == 'integration-test'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      id-token: write
+      issues: write
+      pull-requests: write
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install poetry via pipx  # presumably placed first so the poetry cache below can be resolved - confirm
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          cache: poetry
+          python-version: ${{ matrix.python-version }}
+
+      - name: Comment on PR if 'integration-test' label is present
+        if: github.event.label.name == 'integration-test' && github.event_name == 'pull_request'
+        uses: KeisukeYamashita/create-comment@v1  # posts an acknowledgement before the test steps run
+        with:
+          comment: |
+            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+          unique: false
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+
+      - name: Configure config.toml for testing with Haiku
+        env:
+          LLM_MODEL: 'litellm_proxy/claude-3-5-haiku-20241022'
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |  # writes the [llm.eval] section in one go; each printf argument becomes one line
+          printf '%s\n' \
+            "[llm.eval]" "model = \"$LLM_MODEL\"" \
+            "api_key = \"$LLM_API_KEY\"" \
+            "base_url = \"$LLM_BASE_URL\"" \
+            "temperature = 0.0" > config.toml
+
+      - name: Build environment
+        run: make build
+
+      - name: Run integration test evaluation for Haiku
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: "true"  # quoted: env values are strings, avoid YAML boolean typing
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
+
+          # get integration tests report (a missing report falls back to /dev/null, giving an empty section instead of cat reading stdin)
+          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
+          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> "$GITHUB_ENV"
+          cat "${REPORT_FILE_HAIKU:-/dev/null}" >> "$GITHUB_ENV"
+          echo >> "$GITHUB_ENV"
+          echo "EOF" >> "$GITHUB_ENV"
+
+      - name: Wait a little bit
+        run: sleep 10  # fixed 10-second pause between the two model runs
+
+      - name: Configure config.toml for testing with DeepSeek
+        env:
+          LLM_MODEL: 'litellm_proxy/deepseek-chat'
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |  # overwrites config.toml; each printf argument becomes one line
+          printf '%s\n' \
+            "[llm.eval]" "model = \"$LLM_MODEL\"" \
+            "api_key = \"$LLM_API_KEY\"" \
+            "base_url = \"$LLM_BASE_URL\"" \
+            "temperature = 0.0" > config.toml
+
+      - name: Run integration test evaluation for DeepSeek
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: "true"  # quoted: env values are strings, avoid YAML boolean typing
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
+
+          # get integration tests report (a missing report falls back to /dev/null, giving an empty section instead of cat reading stdin)
+          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> "$GITHUB_ENV"
+          cat "${REPORT_FILE_DEEPSEEK:-/dev/null}" >> "$GITHUB_ENV"
+          echo >> "$GITHUB_ENV"
+          echo "EOF" >> "$GITHUB_ENV"
+
+      - name: Create archive of evaluation outputs
+        run: |
+          TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')  # four-digit year, matching the TIMESTAMP format exported by the later step
+          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
+          tar -czvf "../../../integration_tests_${TIMESTAMP}.tar.gz" integration_tests/CodeActAgent/*  # Only include the actual result directories
+
+      - name: Upload evaluation results as artifact
+        id: upload_results_artifact
+        uses: actions/upload-artifact@v4
+        with:
+          path: integration_tests_*.tar.gz
+          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
+
+      - name: Get artifact URLs
+        run: |  # exposes the upload step's artifact-url output as ARTIFACT_URL for later steps
+          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> "$GITHUB_ENV"
+
+      - name: Set timestamp and trigger reason
+        run: |  # the free-text dispatch reason is read from the event payload, never templated into the script, and newlines are flattened
+          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
+          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then
+            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> "$GITHUB_ENV"
+          elif [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
+            echo "TRIGGER_REASON=manual-$(jq -r '(.inputs.reason // "") | gsub("[\r\n]+"; " ")' "$GITHUB_EVENT_PATH")" >> "$GITHUB_ENV"
+          else
+            echo "TRIGGER_REASON=nightly-scheduled" >> "$GITHUB_ENV"
+          fi
+
+      - name: Comment with results and artifact link
+        uses: KeisukeYamashita/create-comment@v1
+        id: create_comment
+        with:
+          # PR-triggered runs comment on that PR; every other trigger falls back to issue number 5318
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5318 }}
+          unique: false
+          comment: |
+            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
+            Commit: ${{ github.sha }}
+            **Integration Tests Report (Haiku)**
+            Haiku LLM Test Results:
+            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
+            ---
+            **Integration Tests Report (DeepSeek)**
+            DeepSeek LLM Test Results:
+            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
+            ---
+            Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
diff --git a/.github/workflows/lint-fix.yml b/.github/workflows/lint-fix.yml
@@ -0,0 +1,91 @@
+name: "Lint Fix"
+
+on:
+  pull_request:
+    types: [labeled]  # fires on any label; each job below also requires the label to be 'lint-fix'
+
+jobs:
+  # Frontend lint fixes: run the frontend's lint:fix script on the PR branch and push any resulting changes
+  lint-fix-frontend:
+    if: github.event.label.name == 'lint-fix'
+    name: Fix frontend linting issues
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4  # check out the PR's head branch from its head repository so fixes can be pushed back
+        with:
+          ref: ${{ github.head_ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Install Node.js 20
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+      - name: Install frontend dependencies
+        run: |  # npm ci installs exactly what package-lock.json pins and never rewrites it, so the auto-commit cannot pick up lockfile churn
+          cd frontend
+          npm ci
+      - name: Fix frontend lint issues
+        run: |
+          cd frontend
+          npm run lint:fix
+
+      # Commit and push changes if any
+      - name: Check for changes
+        id: git-check
+        run: |  # git diff --quiet exits non-zero when tracked files were modified
+          git diff --quiet || echo "changes=true" >> "$GITHUB_OUTPUT"
+      - name: Commit and push if there are changes
+        if: steps.git-check.outputs.changes == 'true'
+        run: |
+          git config --local user.email "openhands@all-hands.dev"
+          git config --local user.name "OpenHands Bot"
+          git add -A
+          git commit -m "🤖 Auto-fix frontend linting issues"
+          git push
+
+  # Python lint fixes: run the pre-commit hooks over the Python trees on the PR branch and push any resulting changes
+  lint-fix-python:
+    if: github.event.label.name == 'lint-fix'
+    name: Fix Python linting issues
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4  # check out the PR's head branch from its head repository so fixes can be pushed back
+        with:
+          ref: ${{ github.head_ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          fetch-depth: 0
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'  # quoted so the version is a string, not a YAML float
+          cache: 'pip'
+      - name: Install pre-commit
+        run: pip install pre-commit==3.7.0
+      - name: Fix python lint issues
+        run: |
+          shopt -s globstar  # without globstar bash treats ** like *, so only paths exactly two levels deep were matched
+          pre-commit run --config ./dev_config/python/.pre-commit-config.yaml --files openhands/**/* evaluation/**/* tests/**/* || true  # continue even if hooks modify files (exit code 1)
+
+      # Commit and push changes if any
+      - name: Check for changes
+        id: git-check
+        run: |  # git diff --quiet exits non-zero when tracked files were modified
+          git diff --quiet || echo "changes=true" >> "$GITHUB_OUTPUT"
+      - name: Commit and push if there are changes
+        if: steps.git-check.outputs.changes == 'true'
+        run: |
+          git config --local user.email "openhands@all-hands.dev"
+          git config --local user.name "OpenHands Bot"
+          git add -A
+          git commit -m "🤖 Auto-fix Python linting issues"
+          git push
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -30,10 +30,11 @@ jobs:
         run: |
           cd frontend
           npm install --frozen-lockfile
-      - name: Lint
+      - name: Lint and TypeScript compilation
         run: |
           cd frontend
           npm run lint
+          npm run make-i18n && tsc
 
   # Run lint on the python code
   lint-python: