``` ├── .all-contributorsrc ├── .cursorignore ├── .dockerignore ├── .github/ ├── workflows/ ├── ci-lume.yml ├── publish-agent.yml ├── publish-computer-server.yml ├── publish-computer.yml ├── publish-core.yml ├── publish-lume.yml ├── publish-mcp-server.yml ├── publish-pylume.yml ├── publish-som.yml ├── reusable-publish.yml ├── .gitignore ├── .vscode/ ├── launch.json ├── lume.code-workspace ├── py.code-workspace ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE.md ├── README.md ├── docs/ ├── Developer-Guide.md ├── FAQ.md ├── Telemetry.md ├── examples/ ├── agent_examples.py ├── agent_ui_examples.py ├── computer_examples.py ├── computer_ui_examples.py ├── pylume_examples.py ├── som_examples.py ├── utils.py ├── img/ ├── agent.png ├── agent_gradio_ui.png ├── cli.png ├── computer.png ├── logo_black.png ├── logo_white.png ├── libs/ ├── agent/ ├── README.md ├── agent/ ├── __init__.py ├── core/ ├── __init__.py ├── agent.py ├── base.py ├── callbacks.py ├── experiment.py ├── factory.py ├── messages.py ├── provider_config.py ├── telemetry.py ├── tools.py ├── tools/ ├── __init__.py ├── base.py ├── bash.py ├── collection.py ├── computer.py ├── edit.py ├── manager.py ├── types.py ├── visualization.py ``` ## /.all-contributorsrc ```all-contributorsrc path="/.all-contributorsrc" { "projectName": "cua", "projectOwner": "trycua", "files": [ "README.md" ], "commitType": "docs", "commitConvention": "angular", "contributorsPerLine": 7, "contributors": [ { "login": "f-trycua", "name": "f-trycua", "avatar_url": "https://avatars.githubusercontent.com/u/195596869?v=4", "profile": "https://github.com/f-trycua", "contributions": [ "code" ] }, { "login": "pepicrft", "name": "Pedro Piñera Buendía", "avatar_url": "https://avatars.githubusercontent.com/u/663605?v=4", "profile": "http://pepicrft.me", "contributions": [ "code" ] }, { "login": "aktech", "name": "Amit Kumar", "avatar_url": "https://avatars.githubusercontent.com/u/5647941?v=4", "profile": "https://iamit.in", "contributions": [ "code" ] }, 
{ "login": "jellydn", "name": "Dung Duc Huynh (Kaka)", "avatar_url": "https://avatars.githubusercontent.com/u/870029?v=4", "profile": "https://productsway.com/", "contributions": [ "code" ] }, { "login": "ShrootBuck", "name": "Zayd Krunz", "avatar_url": "https://avatars.githubusercontent.com/u/70227235?v=4", "profile": "http://zaydkrunz.com", "contributions": [ "code" ] }, { "login": "PrashantRaj18198", "name": "Prashant Raj", "avatar_url": "https://avatars.githubusercontent.com/u/23168997?v=4", "profile": "https://github.com/PrashantRaj18198", "contributions": [ "code" ] }, { "login": "Leland-Takamine", "name": "Leland Takamine", "avatar_url": "https://avatars.githubusercontent.com/u/847683?v=4", "profile": "https://www.mobile.dev", "contributions": [ "code" ] }, { "login": "ddupont808", "name": "ddupont", "avatar_url": "https://avatars.githubusercontent.com/u/3820588?v=4", "profile": "https://github.com/ddupont808", "contributions": [ "code" ] }, { "login": "Lizzard1123", "name": "Ethan Gutierrez", "avatar_url": "https://avatars.githubusercontent.com/u/46036335?v=4", "profile": "https://github.com/Lizzard1123", "contributions": [ "code" ] }, { "login": "RicterZ", "name": "Ricter Zheng", "avatar_url": "https://avatars.githubusercontent.com/u/5282759?v=4", "profile": "https://ricterz.me", "contributions": [ "code" ] }, { "login": "rahulkarajgikar", "name": "Rahul Karajgikar", "avatar_url": "https://avatars.githubusercontent.com/u/50844303?v=4", "profile": "https://www.trytruffle.ai/", "contributions": [ "code" ] }, { "login": "trospix", "name": "trospix", "avatar_url": "https://avatars.githubusercontent.com/u/81363696?v=4", "profile": "https://github.com/trospix", "contributions": [ "code" ] }, { "login": "eltociear", "name": "Ikko Eltociear Ashimine", "avatar_url": "https://avatars.githubusercontent.com/u/22633385?v=4", "profile": "https://wavee.world/invitation/b96d00e6-b802-4a1b-8a66-2e3854a01ffd", "contributions": [ "code" ] }, { "login": "dp221125", "name": 
"한석호(MilKyo)", "avatar_url": "https://avatars.githubusercontent.com/u/10572119?v=4", "profile": "https://github.com/dp221125", "contributions": [ "code" ] }, { "login": "rahimnathwani", "name": "Rahim Nathwani", "avatar_url": "https://avatars.githubusercontent.com/u/891558?v=4", "profile": "https://www.encona.com/", "contributions": [ "code" ] }, { "login": "mjspeck", "name": "Matt Speck", "avatar_url": "https://avatars.githubusercontent.com/u/20689127?v=4", "profile": "https://mjspeck.github.io/", "contributions": [ "code" ] } ] } ``` ## /.cursorignore ```cursorignore path="/.cursorignore" # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ !libs/lume/scripts/build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Scripts server/scripts/ # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # Ruff stuff: .ruff_cache/ # PyPI configuration file .pypirc # Conda .conda/ # Local environment .env.local # macOS DS_Store .DS_Store weights/ weights/icon_detect/ weights/icon_detect/model.pt weights/icon_detect/model.pt.zip weights/icon_detect/model.pt.zip.part* libs/omniparser/weights/icon_detect/model.pt # Example test data and output examples/test_data/ examples/output/ /screenshots/ /experiments/ /logs/ # Xcode # # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore ## User settings xcuserdata/ ## Obj-C/Swift specific *.hmap ## App packaging *.ipa *.dSYM.zip *.dSYM ## Playgrounds timeline.xctimeline playground.xcworkspace # Swift Package Manager # # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. # Packages/ # Package.pins # Package.resolved # *.xcodeproj # # Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata # hence it is not needed unless you have added a package configuration file to your project .swiftpm/ .build/ # CocoaPods # # We recommend against adding the Pods directory to your .gitignore. 
However # you should judge for yourself, the pros and cons are mentioned at: # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control # # Pods/ # # Add this line if you want to avoid checking in source code from the Xcode workspace # *.xcworkspace # Carthage # # Add this line if you want to avoid checking in source code from Carthage dependencies. # Carthage/Checkouts Carthage/Build/ # fastlane # # It is recommended to not store the screenshots in the git repo. # Instead, use fastlane to re-generate the screenshots whenever they are needed. # For more information about the recommended setup visit: # https://docs.fastlane.tools/best-practices/source-control/#source-control fastlane/report.xml fastlane/Preview.html fastlane/screenshots/**/*.png fastlane/test_output # Ignore folder ignore # .release .release/ ``` ## /.dockerignore ```dockerignore path="/.dockerignore" # Version control .git .github .gitignore # Environment and cache .venv .env .env.local __pycache__ *.pyc *.pyo *.pyd .Python .pytest_cache .pdm-build # Distribution / packaging dist build *.egg-info # Development .vscode .idea *.swp *.swo # Docs docs/site # Notebooks notebooks/.ipynb_checkpoints # Docker Dockerfile .dockerignore ``` ## /.github/workflows/ci-lume.yml ```yml path="/.github/workflows/ci-lume.yml" name: lume on: push: branches: - "main" pull_request: {} concurrency: group: lume-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true # Runner images: https://github.com/actions/runner-images jobs: test: name: Test runs-on: macos-15 steps: - uses: actions/checkout@v4 - run: uname -a - run: sudo xcode-select -s /Applications/Xcode_16.app # Swift 6.0 - run: swift test working-directory: ./libs/lume build: name: Release build runs-on: macos-15 steps: - uses: actions/checkout@v4 - run: uname -a - run: sudo xcode-select -s /Applications/Xcode_16.app # Swift 6.0 - run: swift build --configuration release working-directory: ./libs/lume 
``` ## /.github/workflows/publish-agent.yml ```yml path="/.github/workflows/publish-agent.yml" name: Publish Agent Package on: push: tags: - 'agent-v*' workflow_dispatch: inputs: version: description: 'Version to publish (without v prefix)' required: true default: '0.1.0' workflow_call: inputs: version: description: 'Version to publish' required: true type: string # Adding permissions at workflow level permissions: contents: write jobs: prepare: runs-on: macos-latest outputs: version: ${{ steps.get-version.outputs.version }} computer_version: ${{ steps.update-deps.outputs.computer_version }} som_version: ${{ steps.update-deps.outputs.som_version }} core_version: ${{ steps.update-deps.outputs.core_version }} steps: - uses: actions/checkout@v4 - name: Determine version id: get-version run: | if [ "${{ github.event_name }}" == "push" ]; then # Extract version from tag (for package-specific tags) if [[ "${{ github.ref }}" =~ ^refs/tags/agent-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then VERSION=${BASH_REMATCH[1]} else echo "Invalid tag format for agent" exit 1 fi elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Use version from workflow dispatch VERSION=${{ github.event.inputs.version }} else # Use version from workflow_call VERSION=${{ inputs.version }} fi echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10' - name: Update dependencies to latest versions id: update-deps run: | cd libs/agent # Install required package for PyPI API access pip install requests # Create a more robust Python script for PyPI version checking cat > get_latest_versions.py << 'EOF' import requests import json import sys def get_package_version(package_name, fallback="0.1.0"): try: response = requests.get(f'https://pypi.org/pypi/{package_name}/json') print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr) if response.status_code != 200: print(f"API request 
failed for {package_name}, using fallback version", file=sys.stderr) return fallback data = json.loads(response.text) if 'info' not in data: print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr) return fallback return data['info']['version'] except Exception as e: print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr) return fallback # Get latest versions print(get_package_version('cua-computer')) print(get_package_version('cua-som')) print(get_package_version('cua-core')) EOF # Execute the script to get the versions VERSIONS=($(python get_latest_versions.py)) LATEST_COMPUTER=${VERSIONS[0]} LATEST_SOM=${VERSIONS[1]} LATEST_CORE=${VERSIONS[2]} echo "Latest cua-computer version: $LATEST_COMPUTER" echo "Latest cua-som version: $LATEST_SOM" echo "Latest cua-core version: $LATEST_CORE" # Output the versions for the next job echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT echo "som_version=$LATEST_SOM" >> $GITHUB_OUTPUT echo "core_version=$LATEST_CORE" >> $GITHUB_OUTPUT # Determine major version for version constraint COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. -f1) SOM_MAJOR=$(echo $LATEST_SOM | cut -d. -f1) CORE_MAJOR=$(echo $LATEST_CORE | cut -d. 
-f1) NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1)) NEXT_SOM_MAJOR=$((SOM_MAJOR + 1)) NEXT_CORE_MAJOR=$((CORE_MAJOR + 1)) # Update dependencies in pyproject.toml if [[ "$OSTYPE" == "darwin"* ]]; then # macOS version of sed needs an empty string for -i sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml sed -i '' "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml sed -i '' "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml else # Linux version sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml sed -i "s/\"cua-som>=.*,<.*\"/\"cua-som>=$LATEST_SOM,<$NEXT_SOM_MAJOR.0.0\"/" pyproject.toml sed -i "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml fi # Display the updated dependencies echo "Updated dependencies in pyproject.toml:" grep -E "cua-computer|cua-som|cua-core" pyproject.toml publish: needs: prepare uses: ./.github/workflows/reusable-publish.yml with: package_name: "agent" package_dir: "libs/agent" version: ${{ needs.prepare.outputs.version }} is_lume_package: false base_package_name: "cua-agent" secrets: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} set-env-variables: needs: [prepare, publish] runs-on: macos-latest steps: - name: Set environment variables for use in other jobs run: | echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV echo "SOM_VERSION=${{ needs.prepare.outputs.som_version }}" >> $GITHUB_ENV echo "CORE_VERSION=${{ needs.prepare.outputs.core_version }}" >> $GITHUB_ENV ``` ## /.github/workflows/publish-computer-server.yml ```yml path="/.github/workflows/publish-computer-server.yml" name: Publish Computer Server Package on: push: tags: - 'computer-server-v*' workflow_dispatch: inputs: version: description: 'Version to publish (without v prefix)' required: true default: '0.1.0' workflow_call: 
inputs: version: description: 'Version to publish' required: true type: string outputs: version: description: "The version that was published" value: ${{ jobs.prepare.outputs.version }} # Adding permissions at workflow level permissions: contents: write jobs: prepare: runs-on: macos-latest outputs: version: ${{ steps.get-version.outputs.version }} steps: - uses: actions/checkout@v4 - name: Determine version id: get-version run: | if [ "${{ github.event_name }}" == "push" ]; then # Extract version from tag (for package-specific tags) if [[ "${{ github.ref }}" =~ ^refs/tags/computer-server-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then VERSION=${BASH_REMATCH[1]} else echo "Invalid tag format for computer-server" exit 1 fi elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Use version from workflow dispatch VERSION=${{ github.event.inputs.version }} else # Use version from workflow_call VERSION=${{ inputs.version }} fi echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10' publish: needs: prepare uses: ./.github/workflows/reusable-publish.yml with: package_name: "computer-server" package_dir: "libs/computer-server" version: ${{ needs.prepare.outputs.version }} is_lume_package: false base_package_name: "cua-computer-server" secrets: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} set-env-variables: needs: [prepare, publish] runs-on: macos-latest steps: - name: Set environment variables for use in other jobs run: | echo "COMPUTER_VERSION=${{ needs.prepare.outputs.version }}" >> $GITHUB_ENV ``` ## /.github/workflows/publish-computer.yml ```yml path="/.github/workflows/publish-computer.yml" name: Publish Computer Package on: push: tags: - 'computer-v*' workflow_dispatch: inputs: version: description: 'Version to publish (without v prefix)' required: true default: '0.1.0' workflow_call: inputs: version: description: 'Version to publish' required: true type: string # Adding permissions at 
workflow level permissions: contents: write jobs: prepare: runs-on: macos-latest outputs: version: ${{ steps.get-version.outputs.version }} pylume_version: ${{ steps.update-deps.outputs.pylume_version }} core_version: ${{ steps.update-deps.outputs.core_version }} steps: - uses: actions/checkout@v4 - name: Determine version id: get-version run: | if [ "${{ github.event_name }}" == "push" ]; then # Extract version from tag (for package-specific tags) if [[ "${{ github.ref }}" =~ ^refs/tags/computer-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then VERSION=${BASH_REMATCH[1]} else echo "Invalid tag format for computer" exit 1 fi elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Use version from workflow dispatch VERSION=${{ github.event.inputs.version }} else # Use version from workflow_call VERSION=${{ inputs.version }} fi echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10' - name: Update dependencies to latest versions id: update-deps run: | cd libs/computer # Install required package for PyPI API access pip install requests # Create a more robust Python script for PyPI version checking cat > get_latest_versions.py << 'EOF' import requests import json import sys def get_package_version(package_name, fallback="0.1.0"): try: response = requests.get(f'https://pypi.org/pypi/{package_name}/json') print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr) if response.status_code != 200: print(f"API request failed for {package_name}, using fallback version", file=sys.stderr) return fallback data = json.loads(response.text) if 'info' not in data: print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr) return fallback return data['info']['version'] except Exception as e: print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr) return fallback # Get latest versions 
print(get_package_version('pylume')) print(get_package_version('cua-core')) EOF # Execute the script to get the versions VERSIONS=($(python get_latest_versions.py)) LATEST_PYLUME=${VERSIONS[0]} LATEST_CORE=${VERSIONS[1]} echo "Latest pylume version: $LATEST_PYLUME" echo "Latest cua-core version: $LATEST_CORE" # Output the versions for the next job echo "pylume_version=$LATEST_PYLUME" >> $GITHUB_OUTPUT echo "core_version=$LATEST_CORE" >> $GITHUB_OUTPUT # Determine major version for version constraint CORE_MAJOR=$(echo $LATEST_CORE | cut -d. -f1) NEXT_CORE_MAJOR=$((CORE_MAJOR + 1)) # Update dependencies in pyproject.toml if [[ "$OSTYPE" == "darwin"* ]]; then # macOS version of sed needs an empty string for -i sed -i '' "s/\"pylume>=.*\"/\"pylume>=$LATEST_PYLUME\"/" pyproject.toml sed -i '' "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml else # Linux version sed -i "s/\"pylume>=.*\"/\"pylume>=$LATEST_PYLUME\"/" pyproject.toml sed -i "s/\"cua-core>=.*,<.*\"/\"cua-core>=$LATEST_CORE,<$NEXT_CORE_MAJOR.0.0\"/" pyproject.toml fi # Display the updated dependencies echo "Updated dependencies in pyproject.toml:" grep -E "pylume|cua-core" pyproject.toml publish: needs: prepare uses: ./.github/workflows/reusable-publish.yml with: package_name: "computer" package_dir: "libs/computer" version: ${{ needs.prepare.outputs.version }} is_lume_package: false base_package_name: "cua-computer" secrets: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} set-env-variables: needs: [prepare, publish] runs-on: macos-latest steps: - name: Set environment variables for use in other jobs run: | echo "PYLUME_VERSION=${{ needs.prepare.outputs.pylume_version }}" >> $GITHUB_ENV echo "CORE_VERSION=${{ needs.prepare.outputs.core_version }}" >> $GITHUB_ENV ``` ## /.github/workflows/publish-core.yml ```yml path="/.github/workflows/publish-core.yml" name: Publish Core Package on: push: tags: - 'core-v*' workflow_dispatch: inputs: version: description: 'Version to publish 
(without v prefix)' required: true default: '0.1.0' workflow_call: inputs: version: description: 'Version to publish' required: true type: string # Adding permissions at workflow level permissions: contents: write jobs: prepare: runs-on: macos-latest outputs: version: ${{ steps.get-version.outputs.version }} steps: - uses: actions/checkout@v4 - name: Determine version id: get-version run: | if [ "${{ github.event_name }}" == "push" ]; then # Extract version from tag (for package-specific tags) if [[ "${{ github.ref }}" =~ ^refs/tags/core-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then VERSION=${BASH_REMATCH[1]} else echo "Invalid tag format for core" exit 1 fi elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Use version from workflow dispatch VERSION=${{ github.event.inputs.version }} else # Use version from workflow_call VERSION=${{ inputs.version }} fi echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT publish: needs: prepare uses: ./.github/workflows/reusable-publish.yml with: package_name: "core" package_dir: "libs/core" version: ${{ needs.prepare.outputs.version }} is_lume_package: false base_package_name: "cua-core" secrets: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} ``` ## /.github/workflows/publish-lume.yml ```yml path="/.github/workflows/publish-lume.yml" name: Publish Notarized Lume on: push: tags: - 'lume-v*' workflow_dispatch: inputs: version: description: 'Version to notarize (without v prefix)' required: true default: '0.1.0' workflow_call: inputs: version: description: 'Version to notarize' required: true type: string secrets: APPLICATION_CERT_BASE64: required: true INSTALLER_CERT_BASE64: required: true CERT_PASSWORD: required: true APPLE_ID: required: true TEAM_ID: required: true APP_SPECIFIC_PASSWORD: required: true DEVELOPER_NAME: required: true permissions: contents: write env: APPLICATION_CERT_BASE64: ${{ secrets.APPLICATION_CERT_BASE64 }} INSTALLER_CERT_BASE64: ${{ secrets.INSTALLER_CERT_BASE64 }} CERT_PASSWORD: ${{ 
secrets.CERT_PASSWORD }} APPLE_ID: ${{ secrets.APPLE_ID }} TEAM_ID: ${{ secrets.TEAM_ID }} APP_SPECIFIC_PASSWORD: ${{ secrets.APP_SPECIFIC_PASSWORD }} DEVELOPER_NAME: ${{ secrets.DEVELOPER_NAME }} jobs: notarize: runs-on: macos-15 outputs: sha256_checksums: ${{ steps.generate_checksums.outputs.checksums }} version: ${{ steps.set_version.outputs.version }} steps: - uses: actions/checkout@v4 - name: Select Xcode 16 run: | sudo xcode-select -s /Applications/Xcode_16.app xcodebuild -version - name: Install dependencies run: | brew install cpio - name: Create .release directory run: mkdir -p .release - name: Set version id: set_version run: | # Determine version from tag or input (workflow_dispatch and workflow_call both surface the version via the inputs context) if [[ "$GITHUB_REF" == refs/tags/lume-v* ]]; then VERSION="${GITHUB_REF#refs/tags/lume-v}" echo "Using version from tag: $VERSION" elif [[ -n "${{ inputs.version }}" ]]; then VERSION="${{ inputs.version }}" echo "Using version from input: $VERSION" else echo "Error: No version found in tag or input" exit 1 fi # Update version in Main.swift echo "Updating version in Main.swift to $VERSION" sed -i '' "s/static let current: String = \".*\"/static let current: String = \"$VERSION\"/" libs/lume/src/Main.swift # Set output for later steps echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Import Certificates env: APPLICATION_CERT_BASE64: ${{ secrets.APPLICATION_CERT_BASE64 }} INSTALLER_CERT_BASE64: ${{ secrets.INSTALLER_CERT_BASE64 }} CERT_PASSWORD: ${{ secrets.CERT_PASSWORD }} KEYCHAIN_PASSWORD: "temp_password" run: | # Create a temporary keychain security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain security default-keychain -s build.keychain security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain security set-keychain-settings -t 3600 -l build.keychain # Import certificates echo $APPLICATION_CERT_BASE64 | base64 --decode > application.p12 echo
$INSTALLER_CERT_BASE64 | base64 --decode > installer.p12 # Import certificates silently (minimize output) security import application.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign -T /usr/bin/pkgbuild > /dev/null 2>&1 security import installer.p12 -k build.keychain -P "$CERT_PASSWORD" -T /usr/bin/codesign -T /usr/bin/pkgbuild > /dev/null 2>&1 # Allow codesign to access the certificates (minimal output) security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain > /dev/null 2>&1 # Verify certificates were imported but only show count, not details echo "Verifying signing identity (showing count only)..." security find-identity -v -p codesigning | grep -c "valid identities found" || true # Clean up certificate files rm application.p12 installer.p12 - name: Build and Notarize id: build_notarize env: APPLE_ID: ${{ secrets.APPLE_ID }} TEAM_ID: ${{ secrets.TEAM_ID }} APP_SPECIFIC_PASSWORD: ${{ secrets.APP_SPECIFIC_PASSWORD }} # These will now reference the imported certificates CERT_APPLICATION_NAME: "Developer ID Application: ${{ secrets.DEVELOPER_NAME }} (${{ secrets.TEAM_ID }})" CERT_INSTALLER_NAME: "Developer ID Installer: ${{ secrets.DEVELOPER_NAME }} (${{ secrets.TEAM_ID }})" VERSION: ${{ steps.set_version.outputs.version }} working-directory: ./libs/lume run: | # Minimal debug information echo "Starting build process..." echo "Swift version: $(swift --version | head -n 1)" echo "Building version: $VERSION" # Ensure .release directory exists mkdir -p .release chmod 755 .release # Build the project first (redirect verbose output) echo "Building project..." swift build --configuration release > build.log 2>&1 echo "Build completed." # Run the notarization script with LOG_LEVEL env var chmod +x scripts/build/build-release-notarized.sh cd scripts/build LOG_LEVEL=minimal ./build-release-notarized.sh # Return to the lume directory cd ../.. 
# Debug: List what files were actually created echo "Files in .release directory:" find .release -type f -name "*.tar.gz" -o -name "*.pkg.tar.gz" # Get architecture for output filename ARCH=$(uname -m) OS_IDENTIFIER="darwin-${ARCH}" # Output paths for later use echo "tarball_path=.release/lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" >> $GITHUB_OUTPUT echo "pkg_path=.release/lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" >> $GITHUB_OUTPUT - name: Generate SHA256 Checksums id: generate_checksums working-directory: ./libs/lume/.release run: | # Use existing checksums file if it exists, otherwise generate one if [ -f "checksums.txt" ]; then echo "Using existing checksums file" cat checksums.txt else echo "## SHA256 Checksums" > checksums.txt echo '\`\`\`' >> checksums.txt shasum -a 256 lume-*.tar.gz >> checksums.txt echo '\`\`\`' >> checksums.txt fi checksums=$(cat checksums.txt) echo "checksums<<EOF" >> $GITHUB_OUTPUT echo "$checksums" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT # Debug: Show all files in the release directory echo "All files in release directory:" ls -la - name: Create Standard Version Releases working-directory: ./libs/lume/.release run: | VERSION=${{ steps.set_version.outputs.version }} ARCH=$(uname -m) OS_IDENTIFIER="darwin-${ARCH}" # Create OS-tagged symlinks ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" "lume-darwin.tar.gz" ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" "lume-darwin.pkg.tar.gz" # Create simple symlinks ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.tar.gz" "lume.tar.gz" ln -sf "lume-${VERSION}-${OS_IDENTIFIER}.pkg.tar.gz" "lume.pkg.tar.gz" # List all files (including symlinks) echo "Files with symlinks in release directory:" ls -la - name: Upload Notarized Package (Tarball) uses: actions/upload-artifact@v4 with: name: lume-notarized-tarball path: ./libs/lume/${{ steps.build_notarize.outputs.tarball_path }} if-no-files-found: error - name: Upload Notarized Package (Installer) uses: actions/upload-artifact@v4 with: name:
lume-notarized-installer path: ./libs/lume/${{ steps.build_notarize.outputs.pkg_path }} if-no-files-found: error - name: Create Release if: startsWith(github.ref, 'refs/tags/lume-v') uses: softprops/action-gh-release@v1 with: files: | ./libs/lume/${{ steps.build_notarize.outputs.tarball_path }} ./libs/lume/${{ steps.build_notarize.outputs.pkg_path }} ./libs/lume/.release/lume-darwin.tar.gz ./libs/lume/.release/lume-darwin.pkg.tar.gz ./libs/lume/.release/lume.tar.gz ./libs/lume/.release/lume.pkg.tar.gz body: | ${{ steps.generate_checksums.outputs.checksums }} ### Installation with script /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" \`\`\` generate_release_notes: true make_latest: true ``` ## /.github/workflows/publish-mcp-server.yml ```yml path="/.github/workflows/publish-mcp-server.yml" name: Publish MCP Server Package on: push: tags: - 'mcp-server-v*' workflow_dispatch: inputs: version: description: 'Version to publish (without v prefix)' required: true default: '0.1.0' workflow_call: inputs: version: description: 'Version to publish' required: true type: string outputs: version: description: "The version that was published" value: ${{ jobs.prepare.outputs.version }} # Adding permissions at workflow level permissions: contents: write jobs: prepare: runs-on: macos-latest outputs: version: ${{ steps.get-version.outputs.version }} agent_version: ${{ steps.update-deps.outputs.agent_version }} computer_version: ${{ steps.update-deps.outputs.computer_version }} steps: - uses: actions/checkout@v4 - name: Determine version id: get-version run: | if [ "${{ github.event_name }}" == "push" ]; then # Extract version from tag (for package-specific tags) if [[ "${{ github.ref }}" =~ ^refs/tags/mcp-server-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then VERSION=${BASH_REMATCH[1]} else echo "Invalid tag format for mcp-server" exit 1 fi elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Use version from workflow 
dispatch VERSION=${{ github.event.inputs.version }} else # Use version from workflow_call VERSION=${{ inputs.version }} fi echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10' - name: Update dependencies to latest versions id: update-deps run: | cd libs/mcp-server # Install required package for PyPI API access pip install requests # Create a Python script for PyPI version checking cat > get_latest_versions.py << 'EOF' import requests import json import sys def get_package_version(package_name, fallback="0.1.0"): try: response = requests.get(f'https://pypi.org/pypi/{package_name}/json') print(f"API Response Status for {package_name}: {response.status_code}", file=sys.stderr) if response.status_code != 200: print(f"API request failed for {package_name}, using fallback version", file=sys.stderr) return fallback data = json.loads(response.text) if 'info' not in data: print(f"Missing 'info' key in API response for {package_name}, using fallback version", file=sys.stderr) return fallback return data['info']['version'] except Exception as e: print(f"Error fetching version for {package_name}: {str(e)}", file=sys.stderr) return fallback # Get latest versions print(get_package_version('cua-agent')) print(get_package_version('cua-computer')) EOF # Execute the script to get the versions VERSIONS=($(python get_latest_versions.py)) LATEST_AGENT=${VERSIONS[0]} LATEST_COMPUTER=${VERSIONS[1]} echo "Latest cua-agent version: $LATEST_AGENT" echo "Latest cua-computer version: $LATEST_COMPUTER" # Output the versions for the next job echo "agent_version=$LATEST_AGENT" >> $GITHUB_OUTPUT echo "computer_version=$LATEST_COMPUTER" >> $GITHUB_OUTPUT # Determine major version for version constraint AGENT_MAJOR=$(echo $LATEST_AGENT | cut -d. -f1) COMPUTER_MAJOR=$(echo $LATEST_COMPUTER | cut -d. 
-f1) NEXT_AGENT_MAJOR=$((AGENT_MAJOR + 1)) NEXT_COMPUTER_MAJOR=$((COMPUTER_MAJOR + 1)) # Update dependencies in pyproject.toml if [[ "$OSTYPE" == "darwin"* ]]; then # macOS version of sed needs an empty string for -i # Update cua-agent with all extras sed -i '' "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml sed -i '' "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml else # Linux version sed -i "s/\"cua-agent\[all\]>=.*,<.*\"/\"cua-agent[all]>=$LATEST_AGENT,<$NEXT_AGENT_MAJOR.0.0\"/" pyproject.toml sed -i "s/\"cua-computer>=.*,<.*\"/\"cua-computer>=$LATEST_COMPUTER,<$NEXT_COMPUTER_MAJOR.0.0\"/" pyproject.toml fi # Display the updated dependencies echo "Updated dependencies in pyproject.toml:" grep -E "cua-agent|cua-computer" pyproject.toml publish: needs: prepare uses: ./.github/workflows/reusable-publish.yml with: package_name: "mcp-server" package_dir: "libs/mcp-server" version: ${{ needs.prepare.outputs.version }} is_lume_package: false base_package_name: "cua-mcp-server" secrets: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} set-env-variables: needs: [prepare, publish] runs-on: macos-latest steps: - name: Set environment variables for use in other jobs run: | echo "AGENT_VERSION=${{ needs.prepare.outputs.agent_version }}" >> $GITHUB_ENV echo "COMPUTER_VERSION=${{ needs.prepare.outputs.computer_version }}" >> $GITHUB_ENV ``` ## /.github/workflows/publish-pylume.yml ```yml path="/.github/workflows/publish-pylume.yml" name: Publish Pylume Package on: push: tags: - 'pylume-v*' workflow_dispatch: inputs: version: description: 'Version to publish (without v prefix)' required: true default: '0.1.0' workflow_call: inputs: version: description: 'Version to publish' required: true type: string outputs: version: description: "The version that was published" value: ${{ jobs.determine-version.outputs.version }} # Adding permissions at workflow level permissions: 
contents: write jobs: determine-version: runs-on: macos-latest outputs: version: ${{ steps.get-version.outputs.version }} steps: - uses: actions/checkout@v4 - name: Determine version id: get-version run: | if [ "${{ github.event_name }}" == "push" ]; then # Extract version from tag (for package-specific tags) if [[ "${{ github.ref }}" =~ ^refs/tags/pylume-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then VERSION=${BASH_REMATCH[1]} else echo "Invalid tag format for pylume" exit 1 fi elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Use version from workflow dispatch VERSION=${{ github.event.inputs.version }} else # Use version from workflow_call VERSION=${{ inputs.version }} fi echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT validate-version: runs-on: macos-latest needs: determine-version steps: - uses: actions/checkout@v4 - name: Validate version id: validate-version run: | CODE_VERSION=$(grep '__version__' libs/pylume/pylume/__init__.py | cut -d'"' -f2) if [ "${{ needs.determine-version.outputs.version }}" != "$CODE_VERSION" ]; then echo "Version mismatch: expected $CODE_VERSION, got ${{ needs.determine-version.outputs.version }}" exit 1 fi echo "Version validated: $CODE_VERSION" publish: needs: determine-version uses: ./.github/workflows/reusable-publish.yml with: package_name: "pylume" package_dir: "libs/pylume" version: ${{ needs.determine-version.outputs.version }} is_lume_package: true base_package_name: "pylume" secrets: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} ``` ## /.github/workflows/publish-som.yml ```yml path="/.github/workflows/publish-som.yml" name: Publish SOM Package on: push: tags: - 'som-v*' workflow_dispatch: inputs: version: description: 'Version to publish (without v prefix)' required: true default: '0.1.0' workflow_call: inputs: version: description: 'Version to publish' required: true type: string outputs: version: description: "The version that was published" value: ${{ jobs.determine-version.outputs.version }} # Adding 
permissions at workflow level permissions: contents: write jobs: determine-version: runs-on: macos-latest outputs: version: ${{ steps.get-version.outputs.version }} steps: - uses: actions/checkout@v4 - name: Determine version id: get-version run: | if [ "${{ github.event_name }}" == "push" ]; then # Extract version from tag (for package-specific tags) if [[ "${{ github.ref }}" =~ ^refs/tags/som-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then VERSION=${BASH_REMATCH[1]} else echo "Invalid tag format for som" exit 1 fi elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Use version from workflow dispatch VERSION=${{ github.event.inputs.version }} else # Use version from workflow_call VERSION=${{ inputs.version }} fi echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT publish: needs: determine-version uses: ./.github/workflows/reusable-publish.yml with: package_name: "som" package_dir: "libs/som" version: ${{ needs.determine-version.outputs.version }} is_lume_package: false base_package_name: "cua-som" secrets: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} ``` ## /.github/workflows/reusable-publish.yml ```yml path="/.github/workflows/reusable-publish.yml" name: Reusable Package Publish Workflow on: workflow_call: inputs: package_name: description: 'Name of the package (e.g. pylume, computer, agent)' required: true type: string package_dir: description: 'Directory containing the package relative to workspace root (e.g. libs/pylume)' required: true type: string version: description: 'Version to publish' required: true type: string is_lume_package: description: 'Whether this package includes the lume binary' required: false type: boolean default: false base_package_name: description: 'PyPI package name (e.g. 
pylume, cua-agent)' required: true type: string make_latest: description: 'Whether to mark this release as latest (should only be true for lume)' required: false type: boolean default: false secrets: PYPI_TOKEN: required: true outputs: version: description: "The version that was published" value: ${{ jobs.build-and-publish.outputs.version }} jobs: build-and-publish: runs-on: macos-latest permissions: contents: write # This permission is needed for creating releases outputs: version: ${{ steps.set-version.outputs.version }} steps: - uses: actions/checkout@v4 with: fetch-depth: 0 # Full history for release creation - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.10' - name: Create root pdm.lock file run: | # Create an empty pdm.lock file in the root touch pdm.lock - name: Install PDM uses: pdm-project/setup-pdm@v3 with: python-version: '3.10' cache: true - name: Set version id: set-version run: | echo "VERSION=${{ inputs.version }}" >> $GITHUB_ENV echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT - name: Initialize PDM in package directory run: | # Make sure we're working with a properly initialized PDM project cd ${{ inputs.package_dir }} # Create pdm.lock if it doesn't exist if [ ! -f "pdm.lock" ]; then echo "No pdm.lock found, initializing PDM project..." 
pdm lock fi - name: Set version in package run: | cd ${{ inputs.package_dir }} # Replace pdm bump with direct edit of pyproject.toml if [[ "$OSTYPE" == "darwin"* ]]; then # macOS version of sed needs an empty string for -i sed -i '' "s/version = \".*\"/version = \"$VERSION\"/" pyproject.toml else # Linux version sed -i "s/version = \".*\"/version = \"$VERSION\"/" pyproject.toml fi # Verify version was updated echo "Updated version in pyproject.toml:" grep "version =" pyproject.toml # Conditional step for lume binary download (only for pylume package) - name: Download and setup lume binary if: inputs.is_lume_package run: | # Create a temporary directory for extraction mkdir -p temp_lume # Download the latest lume release directly echo "Downloading latest lume version..." curl -sL "https://github.com/trycua/lume/releases/latest/download/lume.tar.gz" -o temp_lume/lume.tar.gz # Extract the tar file (ignore ownership and suppress warnings) cd temp_lume && tar --no-same-owner -xzf lume.tar.gz # Make the binary executable chmod +x lume # Copy the lume binary to the correct location in the pylume package mkdir -p "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume" cp lume "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume/lume" # Verify the binary exists and is executable test -x "${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pylume/lume" || { echo "lume binary not found or not executable"; exit 1; } # Get the version from the downloaded binary for reference LUME_VERSION=$(./lume --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' || echo "unknown") echo "Using lume version: $LUME_VERSION" # Cleanup cd "${GITHUB_WORKSPACE}" && rm -rf temp_lume # Save the lume version for reference echo "LUME_VERSION=${LUME_VERSION}" >> $GITHUB_ENV - name: Build and publish env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} run: | cd ${{ inputs.package_dir }} # Build with PDM pdm build # For pylume package, verify the binary is in the wheel if [ "${{ inputs.is_lume_package }}" = "true" ]; then 
python -m pip install wheel wheel unpack dist/*.whl --dest temp_wheel echo "Listing contents of wheel directory:" find temp_wheel -type f test -f temp_wheel/pylume-*/pylume/lume || { echo "lume binary not found in wheel"; exit 1; } rm -rf temp_wheel echo "Publishing ${{ inputs.base_package_name }} ${VERSION} with lume ${LUME_VERSION}" else echo "Publishing ${{ inputs.base_package_name }} ${VERSION}" fi # Install and use twine directly instead of PDM publish echo "Installing twine for direct publishing..." pip install twine echo "Publishing to PyPI using twine..." TWINE_USERNAME="__token__" TWINE_PASSWORD="$PYPI_TOKEN" python -m twine upload dist/* # Save the wheel file path for the release WHEEL_FILE=$(ls dist/*.whl | head -1) echo "WHEEL_FILE=${WHEEL_FILE}" >> $GITHUB_ENV - name: Prepare Simple Release Notes if: startsWith(github.ref, 'refs/tags/') run: | # Create release notes based on package type echo "# ${{ inputs.base_package_name }} v${VERSION}" > release_notes.md echo "" >> release_notes.md if [ "${{ inputs.package_name }}" = "pylume" ]; then echo "## Python SDK for lume - run macOS and Linux VMs on Apple Silicon" >> release_notes.md echo "" >> release_notes.md echo "This package provides Python bindings for the lume virtualization tool." 
>> release_notes.md echo "" >> release_notes.md echo "## Dependencies" >> release_notes.md echo "* lume binary: v${LUME_VERSION}" >> release_notes.md elif [ "${{ inputs.package_name }}" = "computer" ]; then echo "## Computer control library for the Computer Universal Automation (CUA) project" >> release_notes.md echo "" >> release_notes.md echo "## Dependencies" >> release_notes.md echo "* pylume: ${PYLUME_VERSION:-latest}" >> release_notes.md elif [ "${{ inputs.package_name }}" = "agent" ]; then echo "## Dependencies" >> release_notes.md echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md echo "* cua-som: ${SOM_VERSION:-latest}" >> release_notes.md echo "" >> release_notes.md echo "## Installation Options" >> release_notes.md echo "" >> release_notes.md echo "### Basic installation with Anthropic" >> release_notes.md echo '\`\`\`bash' >> release_notes.md echo "pip install cua-agent[anthropic]==${VERSION}" >> release_notes.md echo '\`\`\`' >> release_notes.md echo "" >> release_notes.md echo "### With SOM (recommended)" >> release_notes.md echo '\`\`\`bash' >> release_notes.md echo "pip install cua-agent[som]==${VERSION}" >> release_notes.md echo '\`\`\`' >> release_notes.md echo "" >> release_notes.md echo "### All features" >> release_notes.md echo '\`\`\`bash' >> release_notes.md echo "pip install cua-agent[all]==${VERSION}" >> release_notes.md echo '\`\`\`' >> release_notes.md elif [ "${{ inputs.package_name }}" = "som" ]; then echo "## Computer Vision and OCR library for detecting and analyzing UI elements" >> release_notes.md echo "" >> release_notes.md echo "This package provides enhanced UI understanding capabilities through computer vision and OCR." >> release_notes.md elif [ "${{ inputs.package_name }}" = "computer-server" ]; then echo "## Computer Server for the Computer Universal Automation (CUA) project" >> release_notes.md echo "" >> release_notes.md echo "A FastAPI-based server implementation for computer control." 
>> release_notes.md echo "" >> release_notes.md echo "## Dependencies" >> release_notes.md echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md echo "" >> release_notes.md echo "## Usage" >> release_notes.md echo '\`\`\`bash' >> release_notes.md echo "# Run the server" >> release_notes.md echo "cua-computer-server" >> release_notes.md echo '\`\`\`' >> release_notes.md elif [ "${{ inputs.package_name }}" = "mcp-server" ]; then echo "## MCP Server for the Computer-Use Agent (CUA)" >> release_notes.md echo "" >> release_notes.md echo "This package provides MCP (Model Context Protocol) integration for CUA agents, allowing them to be used with Claude Desktop, Cursor, and other MCP clients." >> release_notes.md echo "" >> release_notes.md echo "## Dependencies" >> release_notes.md echo "* cua-computer: ${COMPUTER_VERSION:-latest}" >> release_notes.md echo "* cua-agent: ${AGENT_VERSION:-latest}" >> release_notes.md echo "" >> release_notes.md echo "## Usage" >> release_notes.md echo '\`\`\`bash' >> release_notes.md echo "# Run the MCP server directly" >> release_notes.md echo "cua-mcp-server" >> release_notes.md echo '\`\`\`' >> release_notes.md echo "" >> release_notes.md echo "## Claude Desktop Integration" >> release_notes.md echo "Add to your Claude Desktop configuration (~/.config/claude-desktop/claude_desktop_config.json or OS-specific location):" >> release_notes.md echo '\`\`\`json' >> release_notes.md echo '"mcpServers": {' >> release_notes.md echo ' "cua-agent": {' >> release_notes.md echo ' "command": "cua-mcp-server",' >> release_notes.md echo ' "args": [],' >> release_notes.md echo ' "env": {' >> release_notes.md echo ' "CUA_AGENT_LOOP": "OMNI",' >> release_notes.md echo ' "CUA_MODEL_PROVIDER": "ANTHROPIC",' >> release_notes.md echo ' "CUA_MODEL_NAME": "claude-3-opus-20240229",' >> release_notes.md echo ' "ANTHROPIC_API_KEY": "your-api-key",' >> release_notes.md echo ' "PYTHONIOENCODING": "utf-8"' >> release_notes.md echo ' }' >> 
release_notes.md echo ' }' >> release_notes.md echo '}' >> release_notes.md echo '\`\`\`' >> release_notes.md fi # Add installation section if not agent (which has its own installation section) if [ "${{ inputs.package_name }}" != "agent" ]; then echo "" >> release_notes.md echo "## Installation" >> release_notes.md echo '\`\`\`bash' >> release_notes.md echo "pip install ${{ inputs.base_package_name }}==${VERSION}" >> release_notes.md echo '\`\`\`' >> release_notes.md fi echo "Release notes created:" cat release_notes.md - name: Create GitHub Release uses: softprops/action-gh-release@v2 if: startsWith(github.ref, 'refs/tags/') with: name: "${{ inputs.base_package_name }} v${{ env.VERSION }}" body_path: release_notes.md files: ${{ inputs.package_dir }}/${{ env.WHEEL_FILE }} draft: false prerelease: false make_latest: ${{ inputs.package_name == 'lume' }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ``` ## /.gitignore ```gitignore path="/.gitignore" # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ !libs/lume/scripts/build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py .pdm.toml .pdm-python .pdm-build/ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Scripts server/scripts/ # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # Ruff stuff: .ruff_cache/ # PyPI configuration file .pypirc # Conda .conda/ # Local environment .env.local # macOS DS_Store .DS_Store weights/ weights/icon_detect/ weights/icon_detect/model.pt weights/icon_detect/model.pt.zip weights/icon_detect/model.pt.zip.part* libs/omniparser/weights/icon_detect/model.pt # Example test data and output examples/test_data/ examples/output/ /screenshots/ /experiments/ /logs/ # Xcode # # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore ## User settings xcuserdata/ ## Obj-C/Swift specific *.hmap ## App packaging *.ipa *.dSYM.zip *.dSYM ## Playgrounds timeline.xctimeline playground.xcworkspace # Swift Package Manager # # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. 
# Packages/ # Package.pins # Package.resolved # *.xcodeproj # # Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata # hence it is not needed unless you have added a package configuration file to your project .swiftpm/ .build/ # CocoaPods # # We recommend against adding the Pods directory to your .gitignore. However # you should judge for yourself, the pros and cons are mentioned at: # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control # # Pods/ # # Add this line if you want to avoid checking in source code from the Xcode workspace # *.xcworkspace # Carthage # # Add this line if you want to avoid checking in source code from Carthage dependencies. # Carthage/Checkouts Carthage/Build/ # fastlane # # It is recommended to not store the screenshots in the git repo. # Instead, use fastlane to re-generate the screenshots whenever they are needed. # For more information about the recommended setup visit: # https://docs.fastlane.tools/best-practices/source-control/#source-control fastlane/report.xml fastlane/Preview.html fastlane/screenshots/**/*.png fastlane/test_output # Ignore folder ignore # .release .release/ # Shared folder shared # Trajectories trajectories/ # Installation ID Storage .storage/ # Gradio settings .gradio_settings.json ``` ## /.vscode/launch.json ```json path="/.vscode/launch.json" { "configurations": [ { "name": "Run Computer Examples", "type": "debugpy", "request": "launch", "program": "examples/computer_examples.py", "console": "integratedTerminal", "justMyCode": true, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run Agent Examples", "type": "debugpy", "request": "launch", 
"program": "examples/agent_examples.py", "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run Agent UI Examples", "type": "debugpy", "request": "launch", "program": "examples/agent_ui_examples.py", "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run PyLume Examples", "type": "debugpy", "request": "launch", "program": "examples/pylume_examples.py", "console": "integratedTerminal", "justMyCode": true, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "SOM: Run Experiments (No OCR)", "type": "debugpy", "request": "launch", "program": "examples/som_examples.py", "args": [ "examples/test_data", "--output-dir", "examples/output", "--ocr", "none", "--mode", "experiment" ], "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": 
"${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "SOM: Run Experiments (EasyOCR)", "type": "debugpy", "request": "launch", "program": "examples/som_examples.py", "args": [ "examples/test_data", "--output-dir", "examples/output", "--ocr", "easyocr", "--mode", "experiment" ], "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run Computer Server", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/libs/computer-server/run_server.py", "console": "integratedTerminal", "justMyCode": true, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run Computer Server with Args", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/libs/computer-server/run_server.py", "args": [ "--host", "0.0.0.0", "--port", "8000", "--log-level", "debug" ], "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer-server" } }, { "type": "lldb", "request": "launch", "args": [], "cwd": "${workspaceFolder:cua-root}/libs/lume", "name": "Debug lume (libs/lume)", "program": 
"${workspaceFolder:cua-root}/libs/lume/.build/debug/lume", "preLaunchTask": "swift: Build Debug lume (libs/lume)" }, { "type": "lldb", "request": "launch", "args": [], "cwd": "${workspaceFolder:cua-root}/libs/lume", "name": "Release lume (libs/lume)", "program": "${workspaceFolder:cua-root}/libs/lume/.build/release/lume", "preLaunchTask": "swift: Build Release lume (libs/lume)" } ] } ``` ## /.vscode/lume.code-workspace ```code-workspace path="/.vscode/lume.code-workspace" { "folders": [ { "name": "lume", "path": "../libs/lume" } ], "settings": { "files.exclude": { "**/.git": true, "**/.svn": true, "**/.hg": true, "**/CVS": true, "**/.DS_Store": true }, "swift.path.swift_driver_bin": "/usr/bin/swift", "swift.enableLanguageServer": true, "files.associations": { "*.swift": "swift" }, "[swift]": { "editor.formatOnSave": true, "editor.detectIndentation": true, "editor.tabSize": 4 }, "swift.path": "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin", "swift.swiftEnvironmentVariables": { "DEVELOPER_DIR": "/Applications/Xcode.app" }, "lldb.library": "/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Versions/A/LLDB", "lldb.launch.expressions": "native" }, "tasks": { "version": "2.0.0", "tasks": [ { "label": "build-debug", "type": "shell", "command": "${workspaceFolder:lume}/scripts/build/build-debug.sh", "options": { "cwd": "${workspaceFolder:lume}" }, "group": { "kind": "build", "isDefault": true }, "presentation": { "reveal": "silent", "panel": "shared" }, "problemMatcher": [] }, { "label": "swift: Build Debug lume", "type": "shell", "command": "${workspaceFolder:lume}/scripts/build/build-debug.sh", "options": { "cwd": "${workspaceFolder:lume}" }, "group": "build", "presentation": { "reveal": "silent", "panel": "shared" }, "problemMatcher": [] } ] }, "launch": { "configurations": [ { "type": "bashdb", "request": "launch", "name": "Bash-Debug (select script from list of sh files)", "cwd": "${workspaceFolder}", "program": 
"${command:SelectScriptName}", "pathBash": "/opt/homebrew/bin/bash", "args": [] }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "serve" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume serve", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "create", "macos-vm", "--cpu", "4", "--memory", "4GB", "--disk-size", "40GB", "--ipsw", "/Users//Downloads/UniversalMac_15.2_24C101_Restore.ipsw" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume create --os macos --ipsw 'path/to/ipsw' (macos)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "create", "macos-vm", "--cpu", "4", "--memory", "4GB", "--disk-size", "20GB", "--ipsw", "latest" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume create --os macos --ipsw latest (macos)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "create", "linux-vm", "--os", "linux", "--cpu", "4", "--memory", "4GB", "--disk-size", "20GB" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume create --os linux (linux)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "pull", "macos-sequoia-vanilla:15.2", "--name", "macos-vm-cloned" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume pull (macos)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "run", "macos-vm", "--shared-dir", "/Users//repos/trycua/lume/shared_folder:rw", "--start-vnc" ], "cwd": "${workspaceFolder:lume}", "name": 
"Debug lume run (macos)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "run", "linux-vm", "--start-vnc", "--mount", "/Users//Downloads/ubuntu-24.04.1-live-server-arm64.iso" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume run with setup mount (linux)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "run", "linux-vm", "--start-vnc" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume run (linux)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "get", "macos-vm" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume get (macos)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "ls" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume ls", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "images" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume images", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "sourceLanguages": [ "swift" ], "args": [ "stop", "macos-vm" ], "cwd": "${workspaceFolder:lume}", "name": "Debug lume stop (macos)", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "build-debug" }, { "type": "lldb", "request": "launch", "args": [], "cwd": "${workspaceFolder:lume}", "name": "Debug lume", "program": "${workspaceFolder:lume}/.build/debug/lume", "preLaunchTask": "swift: Build Debug lume" }, { "type": "lldb", "request": 
"launch", "args": [], "cwd": "${workspaceFolder:lume}", "name": "Release lume", "program": "${workspaceFolder:lume}/.build/release/lume", "preLaunchTask": "swift: Build Release lume" }, { "type": "bashdb", "request": "launch", "name": "Bash-Debug (select script)", "cwd": "${workspaceFolder:lume}", "program": "${command:SelectScriptName}", "pathBash": "/opt/homebrew/bin/bash", "args": [] } ] } } ``` ## /.vscode/py.code-workspace ```code-workspace path="/.vscode/py.code-workspace" { "folders": [ { "name": "cua-root", "path": ".." }, { "name": "computer", "path": "../libs/computer" }, { "name": "agent", "path": "../libs/agent" }, { "name": "som", "path": "../libs/som" }, { "name": "computer-server", "path": "../libs/computer-server" }, { "name": "pylume", "path": "../libs/pylume" }, { "name": "core", "path": "../libs/core" } ], "settings": { "files.exclude": { "**/.git": true, "**/.svn": true, "**/.hg": true, "**/CVS": true, "**/.DS_Store": true, "**/__pycache__": true, "**/.pytest_cache": true, "**/*.pyc": true }, "python.testing.pytestEnabled": true, "python.testing.unittestEnabled": false, "python.testing.nosetestsEnabled": false, "python.testing.pytestArgs": [ "libs" ], "python.analysis.extraPaths": [ "${workspaceFolder:cua-root}/libs/core", "${workspaceFolder:cua-root}/libs/computer", "${workspaceFolder:cua-root}/libs/agent", "${workspaceFolder:cua-root}/libs/som", "${workspaceFolder:cua-root}/libs/pylume", "${workspaceFolder:cua-root}/.vscode/typings" ], "python.envFile": "${workspaceFolder:cua-root}/.env", "python.defaultInterpreterPath": "${workspaceFolder:cua-root}/.venv/bin/python", "python.analysis.diagnosticMode": "workspace", "python.analysis.typeCheckingMode": "basic", "python.analysis.autoSearchPaths": true, "python.analysis.stubPath": "${workspaceFolder:cua-root}/.vscode/typings", "python.analysis.indexing": false, "python.analysis.exclude": [ "**/node_modules/**", "**/__pycache__/**", "**/.*/**", "**/venv/**", "**/.venv/**", "**/dist/**", 
"**/build/**", ".pdm-build/**", "**/.git/**", "examples/**", "notebooks/**", "logs/**", "screenshots/**" ], "python.analysis.packageIndexDepths": [ { "name": "computer", "depth": 2 }, { "name": "agent", "depth": 2 }, { "name": "som", "depth": 2 }, { "name": "pylume", "depth": 2 }, { "name": "core", "depth": 2 } ], "python.autoComplete.extraPaths": [ "${workspaceFolder:cua-root}/libs/core", "${workspaceFolder:cua-root}/libs/computer", "${workspaceFolder:cua-root}/libs/agent", "${workspaceFolder:cua-root}/libs/som", "${workspaceFolder:cua-root}/libs/pylume" ], "python.languageServer": "Pylance", "[python]": { "editor.formatOnSave": true, "editor.defaultFormatter": "ms-python.black-formatter", "editor.codeActionsOnSave": { "source.organizeImports": "explicit" } }, "files.associations": { "examples/computer_examples.py": "python", "examples/agent_examples.py": "python" }, "python.interpreterPaths": { "examples/computer_examples.py": "${workspaceFolder}/libs/computer/.venv/bin/python", "examples/agent_examples.py": "${workspaceFolder}/libs/agent/.venv/bin/python" } }, "tasks": { "version": "2.0.0", "tasks": [ { "label": "Build Dependencies", "type": "shell", "command": "${workspaceFolder}/scripts/build.sh", "presentation": { "reveal": "always", "panel": "new", "clear": true }, "group": { "kind": "build", "isDefault": true }, "options": { "shell": { "executable": "/bin/bash", "args": ["-l", "-c"] } }, "problemMatcher": [] } ] }, "launch": { "version": "0.2.0", "configurations": [ { "name": "Run Computer Examples", "type": "debugpy", "request": "launch", "program": "examples/computer_examples.py", "console": "integratedTerminal", "justMyCode": true, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { 
"name": "Run Agent Examples", "type": "debugpy", "request": "launch", "program": "examples/agent_examples.py", "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run PyLume Examples", "type": "debugpy", "request": "launch", "program": "examples/pylume_examples.py", "console": "integratedTerminal", "justMyCode": true, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "SOM: Run Experiments (No OCR)", "type": "debugpy", "request": "launch", "program": "examples/som_examples.py", "args": [ "examples/test_data", "--output-dir", "examples/output", "--ocr", "none", "--mode", "experiment" ], "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "SOM: Run Experiments (EasyOCR)", "type": "debugpy", "request": "launch", "program": "examples/som_examples.py", "args": [ "examples/test_data", "--output-dir", "examples/output", "--ocr", "easyocr", "--mode", "experiment" ], "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": 
"${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run Computer Server", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/libs/computer-server/run_server.py", "console": "integratedTerminal", "justMyCode": true, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer:${workspaceFolder:cua-root}/libs/agent:${workspaceFolder:cua-root}/libs/som:${workspaceFolder:cua-root}/libs/pylume" } }, { "name": "Run Computer Server with Args", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/libs/computer-server/run_server.py", "args": [ "--host", "0.0.0.0", "--port", "8000", "--log-level", "debug" ], "console": "integratedTerminal", "justMyCode": false, "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { "PYTHONPATH": "${workspaceFolder:cua-root}/libs/core:${workspaceFolder:cua-root}/libs/computer-server" } } ] }, "compounds": [ { "name": "Run Computer Examples + Server", "configurations": ["Run Computer Examples", "Run Computer Server"], "stopAll": true, "presentation": { "group": "Computer", "order": 1 } }, { "name": "Run Server with Keep-Alive Client", "configurations": ["Run Computer Server", "Test Server Connection (Keep Alive)"], "stopAll": true, "presentation": { "group": "Computer", "order": 2 } } ], "inputs": [ { "id": "imagePath", "type": "promptString", "description": "Path to the image file or directory for icon detection", "default": "${workspaceFolder}/examples/test_data" } ] } ``` ## /CONTRIBUTING.md # Contributing to cua We deeply appreciate your interest in contributing to cua! 
Whether you're reporting bugs, suggesting enhancements, improving docs, or submitting pull requests, your contributions help improve the project for everyone. ## Reporting Bugs If you've encountered a bug in the project, we encourage you to report it. Please follow these steps: 1. **Check the Issue Tracker**: Before submitting a new bug report, please check our issue tracker to see if the bug has already been reported. 2. **Create a New Issue**: If the bug hasn't been reported, create a new issue with: - A clear title and detailed description - Steps to reproduce the issue - Expected vs actual behavior - Your environment (macOS version, lume version) - Any relevant logs or error messages 3. **Label Your Issue**: Label your issue as a `bug` to help maintainers identify it quickly. ## Suggesting Enhancements We're always looking for suggestions to make lume better. If you have an idea: 1. **Check Existing Issues**: See if someone else has already suggested something similar. 2. **Create a New Issue**: If your enhancement is new, create an issue describing: - The problem your enhancement solves - How your enhancement would work - Any potential implementation details - Why this enhancement would benefit lume users ## Code Formatting We follow strict code formatting guidelines to ensure consistency across the codebase. Before submitting any code: 1. **Review Our Format Guide**: Please review our [Code Formatting Standards](docs/Developer-Guide.md#code-formatting-standards) section in the Getting Started guide. 2. **Configure Your IDE**: We recommend using the workspace settings provided in `.vscode/` for automatic formatting. 3. **Run Formatting Tools**: Always run the formatting tools before submitting a PR: ```bash # For Python code pdm run black . pdm run ruff check --fix . ``` 4. **Validate Your Code**: Ensure your code passes all checks: ```bash pdm run mypy . ``` ## Documentation Documentation improvements are always welcome. 
You can: - Fix typos or unclear explanations - Add examples and use cases - Improve API documentation - Add tutorials or guides For detailed instructions on setting up your development environment and submitting code contributions, please see our [Developer-Guide](./docs/Developer-Guide.md) guide. Feel free to join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get help with your contributions. ## /Dockerfile ``` path="/Dockerfile" FROM python:3.11-slim # Set environment variables ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 \ PIP_NO_CACHE_DIR=1 \ PIP_DISABLE_PIP_VERSION_CHECK=1 \ PYTHONPATH="/app/libs/core:/app/libs/computer:/app/libs/agent:/app/libs/som:/app/libs/pylume:/app/libs/computer-server" # Install system dependencies for ARM architecture RUN apt-get update && apt-get install -y --no-install-recommends \ git \ build-essential \ libgl1-mesa-glx \ libglib2.0-0 \ libxcb-xinerama0 \ libxkbcommon-x11-0 \ cmake \ pkg-config \ curl \ iputils-ping \ net-tools \ sed \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* # Set working directory WORKDIR /app # Copy the entire project temporarily # We'll mount the real source code over this at runtime COPY . /app/ # Create a simple .env.local file for build.sh RUN echo "PYTHON_BIN=python" > /app/.env.local # Modify build.sh to skip virtual environment creation RUN sed -i 's/python -m venv .venv/echo "Skipping venv creation in Docker"/' /app/scripts/build.sh && \ sed -i 's/source .venv\/bin\/activate/echo "Skipping venv activation in Docker"/' /app/scripts/build.sh && \ sed -i 's/find . 
-type d -name ".venv" -exec rm -rf {} +/echo "Skipping .venv removal in Docker"/' /app/scripts/build.sh && \ chmod +x /app/scripts/build.sh # Run the build script to install dependencies RUN cd /app && ./scripts/build.sh # Clean up the source files now that dependencies are installed # When we run the container, we'll mount the actual source code RUN rm -rf /app/* /app/.??* # Note: This Docker image doesn't contain the lume executable (macOS-specific) # Instead, it relies on connecting to a lume server running on the host machine # via host.docker.internal:3000 # Default command CMD ["bash"] ``` ## /LICENSE.md MIT License Copyright (c) 2025 trycua Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ## /README.md
Cua logo [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#) [![Swift](https://img.shields.io/badge/Swift-F05138?logo=swift&logoColor=white)](#) [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#) [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
**TL;DR**: **c/ua** (pronounced "koo-ah", short for Computer-Use Agent) is a framework that enables AI agents to control full operating systems within high-performance, lightweight virtual containers. It delivers up to 97% native speed on Apple Silicon and works with any vision language models. ## What is c/ua? **c/ua** offers two primary capabilities in a single integrated framework: 1. **High-Performance Virtualization** - Create and run macOS/Linux virtual machines on Apple Silicon with near-native performance (up to 97% of native speed) using the **Lume CLI** with `Apple's Virtualization.Framework`. 2. **Computer-Use Interface & Agent** - A framework that allows AI systems to observe and control these virtual environments - interacting with applications, browsing the web, writing code, and performing complex workflows. ## Why Use c/ua? - **Security & Isolation**: Run AI agents in fully isolated virtual environments instead of giving them access to your main system - **Performance**: [Near-native performance](https://browser.geekbench.com/v6/cpu/compare/11283746?baseline=11102709) on Apple Silicon - **Flexibility**: Run macOS or Linux environments with the same framework - **Reproducibility**: Create consistent, deterministic environments for AI agent workflows - **LLM Integration**: Built-in support for connecting to various LLM providers ## System Requirements - Mac with Apple Silicon (M1/M2/M3/M4 series) - macOS 15 (Sequoia) or newer - Python 3.10+ (required for the Computer, Agent, and MCP libraries). We recommend using Conda (or Anaconda) to create an ad hoc Python environment. - Disk space for VM images (30GB+ recommended) ## Quick Start ### Option 1: Lume CLI Only (VM Management) If you only need the virtualization capabilities: ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` For Lume usage instructions, refer to the [Lume documentation](./libs/lume/README.md). 
### Option 2: Full Computer-Use Agent Capabilities If you want to use AI agents with virtualized environments: 1. Install the Lume CLI: ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` 2. Pull the latest macOS CUA image: ```bash lume pull macos-sequoia-cua:latest ``` 3. Start Lume daemon service: ```bash lume serve ``` 4. Install the Python libraries: ```bash pip install cua-computer cua-agent[all] ``` 5. Use the libraries in your Python code: ```python from computer import Computer from agent import ComputerAgent, LLM, AgentLoop, LLMProvider async with Computer(verbosity=logging.DEBUG) as macos_computer: agent = ComputerAgent( computer=macos_computer, loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC) ) tasks = [ "Look for a repository named trycua/cua on GitHub.", ] for task in tasks: async for result in agent.run(task): print(result) ``` Explore the [Agent Notebook](./notebooks/) for a ready-to-run example. 6. Optionally, you can use the Agent with a Gradio UI: ```python from utils import load_dotenv_files load_dotenv_files() from agent.ui.gradio.app import create_gradio_ui app = create_gradio_ui() app.launch(share=False) ``` ### Option 3: Build from Source (Nightly) If you want to contribute to the project or need the latest nightly features: ```bash # Clone the repository git clone https://github.com/trycua/cua.git cd cua # Open the project in VSCode code ./.vscode/py.code-workspace # Build the project ./scripts/build.sh ``` See our [Developer-Guide](./docs/Developer-Guide.md) for more information. ## Monorepo Libraries | Library | Description | Installation | Version | |---------|-------------|--------------|---------| | [**Lume**](./libs/lume/README.md) | CLI for running macOS/Linux VMs with near-native performance using Apple's `Virtualization.Framework`. 
| [![Download](https://img.shields.io/badge/Download-333333?style=for-the-badge&logo=github&logoColor=white)](https://github.com/trycua/cua/releases/latest/download/lume.pkg.tar.gz) | [![GitHub release](https://img.shields.io/github/v/release/trycua/cua?color=333333)](https://github.com/trycua/cua/releases) | | [**Computer**](./libs/computer/README.md) | Computer-Use Interface (CUI) framework for interacting with macOS/Linux sandboxes | `pip install cua-computer` | [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/) | | [**Agent**](./libs/agent/README.md) | Computer-Use Agent (CUA) framework for running agentic workflows in macOS/Linux dedicated sandboxes | `pip install cua-agent` | [![PyPI](https://img.shields.io/pypi/v/cua-agent?color=333333)](https://pypi.org/project/cua-agent/) | ## Docs For the best onboarding experience with the packages in this monorepo, we recommend starting with the [Computer](./libs/computer/README.md) documentation to cover the core functionality of the Computer sandbox, then exploring the [Agent](./libs/agent/README.md) documentation to understand Cua's AI agent capabilities, and finally working through the Notebook examples. - [Lume](./libs/lume/README.md) - [Computer](./libs/computer/README.md) - [Agent](./libs/agent/README.md) - [Notebooks](./notebooks/) ## Demos Demos of the Computer-Use Agent in action. Share your most impressive demos in Cua's [Discord community](https://discord.com/invite/mVnXXpdE85)!
MCP Server: Work with Claude Desktop and Tableau
AI-Gradio: multi-app workflow requiring browser, VS Code and terminal access
Notebook: Fix GitHub issue in Cursor
## Accessory Libraries | Library | Description | Installation | Version | |---------|-------------|--------------|---------| | [**Core**](./libs/core/README.md) | Core functionality and utilities used by other Cua packages | `pip install cua-core` | [![PyPI](https://img.shields.io/pypi/v/cua-core?color=333333)](https://pypi.org/project/cua-core/) | | [**PyLume**](./libs/pylume/README.md) | Python bindings for Lume | `pip install pylume` | [![PyPI](https://img.shields.io/pypi/v/pylume?color=333333)](https://pypi.org/project/pylume/) | | [**Computer Server**](./libs/computer-server/README.md) | Server component for the Computer-Use Interface (CUI) framework | `pip install cua-computer-server` | [![PyPI](https://img.shields.io/pypi/v/cua-computer-server?color=333333)](https://pypi.org/project/cua-computer-server/) | | [**SOM**](./libs/som/README.md) | Set-of-Mark library for Agent | `pip install cua-som` | [![PyPI](https://img.shields.io/pypi/v/cua-som?color=333333)](https://pypi.org/project/cua-som/) | ## Contributing We welcome and greatly appreciate contributions to Cua! Whether you're improving documentation, adding new features, fixing bugs, or adding new VM images, your efforts help make Cua better for everyone. For detailed instructions on how to contribute, please refer to our [Contributing Guidelines](CONTRIBUTING.md). Join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get assistance. ## License Cua is open-sourced under the MIT License - see the [LICENSE](LICENSE.md) file for details. Microsoft's OmniParser, which is used in this project, is licensed under the Creative Commons Attribution 4.0 International License (CC-BY-4.0) - see the [OmniParser LICENSE](https://github.com/microsoft/OmniParser/blob/master/LICENSE) file for details. ## Trademarks Apple, macOS, and Apple Silicon are trademarks of Apple Inc. Ubuntu and Canonical are registered trademarks of Canonical Ltd. 
Microsoft is a registered trademark of Microsoft Corporation. This project is not affiliated with, endorsed by, or sponsored by Apple Inc., Canonical Ltd., or Microsoft Corporation. ## Stargazers over time [![Stargazers over time](https://starchart.cc/trycua/cua.svg?variant=adaptive)](https://starchart.cc/trycua/cua) ## Contributors
f-trycua
f-trycua

💻
Pedro Piñera Buendía
Pedro Piñera Buendía

💻
Amit Kumar
Amit Kumar

💻
Dung Duc Huynh (Kaka)
Dung Duc Huynh (Kaka)

💻
Zayd Krunz
Zayd Krunz

💻
Prashant Raj
Prashant Raj

💻
Leland Takamine
Leland Takamine

💻
ddupont
ddupont

💻
Ethan Gutierrez
Ethan Gutierrez

💻
Ricter Zheng
Ricter Zheng

💻
Rahul Karajgikar
Rahul Karajgikar

💻
trospix
trospix

💻
Ikko Eltociear Ashimine
Ikko Eltociear Ashimine

💻
한석호(MilKyo)
한석호(MilKyo)

💻
Rahim Nathwani
Rahim Nathwani

💻
Matt Speck
Matt Speck

💻
## /docs/Developer-Guide.md # Getting Started ## Project Structure The project is organized as a monorepo with these main packages: - `libs/core/` - Base package with telemetry support - `libs/computer/` - Computer-use interface (CUI) library - `libs/agent/` - AI agent library with multi-provider support - `libs/som/` - Set-of-Mark parser - `libs/computer-server/` - Server component for VM - `libs/lume/` - Lume CLI - `libs/pylume/` - Python bindings for Lume Each package has its own virtual environment and dependencies, managed through PDM. ## Local Development Setup 1. Install Lume CLI: ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` 2. Clone the repository: ```bash git clone https://github.com/trycua/cua.git cd cua ``` 3. Create a `.env.local` file in the root directory with your API keys: ```bash # Required for Anthropic provider ANTHROPIC_API_KEY=your_anthropic_key_here # Required for OpenAI provider OPENAI_API_KEY=your_openai_key_here ``` 4. Open the workspace in VSCode or Cursor: ```bash # For Cua Python development code .vscode/py.code-workspace # For Lume (Swift) development code .vscode/lume.code-workspace ``` Using the workspace file is strongly recommended as it: - Sets up correct Python environments for each package - Configures proper import paths - Enables debugging configurations - Maintains consistent settings across packages ## Lume Development Refer to the [Lume README](../libs/lume/docs/Development.md) for instructions on how to develop the Lume CLI. ## Python Development There are two ways to instal Lume: ### Run the build script Run the build script to set up all packages: ```bash ./scripts/build.sh ``` The build script creates a shared virtual environment for all packages. The workspace configuration automatically handles import paths with the correct Python path settings. 
This will: - Create a virtual environment for the project - Install all packages in development mode - Set up the correct Python path - Install development tools ### Install with PDM If PDM is not already installed, you can follow the installation instructions [here](https://pdm-project.org/en/latest/#installation). To install with PDM, simply run: ```console pdm install -G:all ``` This installs all the dependencies for development, testing, and building the docs. If you'd only like development dependencies, you can run: ```console pdm install -d ``` ## Running Examples The Python workspace includes launch configurations for all packages: - "Run Computer Examples" - Runs computer examples - "Run Computer API Server" - Runs the computer-server - "Run Agent Examples" - Runs agent examples - "SOM" configurations - Various settings for running SOM To run examples from VSCode / Cursor: 1. Press F5 or use the Run/Debug view 2. Select the desired configuration The workspace also includes compound launch configurations: - "Run Computer Examples + Server" - Runs both the Computer Examples and Server simultaneously ## Docker Development Environment As an alternative to installing directly on your host machine, you can use Docker for development. This approach has several advantages: ### Prerequisites - Docker installed on your machine - Lume server running on your host (port 3000): `lume serve` ### Setup and Usage 1. Build the development Docker image: ```bash ./scripts/run-docker-dev.sh build ``` 2. Run an example in the container: ```bash ./scripts/run-docker-dev.sh run computer_examples.py ``` 3. Get an interactive shell in the container: ```bash ./scripts/run-docker-dev.sh run --interactive ``` 4. 
Stop any running containers: ```bash ./scripts/run-docker-dev.sh stop ``` ### How it Works The Docker development environment: - Installs all required Python dependencies in the container - Mounts your source code from the host at runtime - Automatically configures the connection to use host.docker.internal:3000 for accessing the Lume server on your host machine - Preserves your code changes without requiring rebuilds (source code is mounted as a volume) > **Note**: The Docker container doesn't include the macOS-specific Lume executable. Instead, it connects to the Lume server running on your host machine via host.docker.internal:3000. Make sure to start the Lume server on your host before running examples in the container. ## Cleanup and Reset If you need to clean up the environment (non-docker) and start fresh: ```bash ./scripts/cleanup.sh ``` This will: - Remove all virtual environments - Clean Python cache files and directories - Remove build artifacts - Clean PDM-related files - Reset environment configurations ## Code Formatting Standards The cua project follows strict code formatting standards to ensure consistency across all packages. ### Python Code Formatting #### Tools The project uses the following tools for code formatting and linting: - **[Black](https://black.readthedocs.io/)**: Code formatter - **[Ruff](https://beta.ruff.rs/docs/)**: Fast linter and formatter - **[MyPy](https://mypy.readthedocs.io/)**: Static type checker These tools are automatically installed when you set up the development environment using the `./scripts/build.sh` script. 
#### Configuration The formatting configuration is defined in the root `pyproject.toml` file: ```toml [tool.black] line-length = 100 target-version = ["py310"] [tool.ruff] line-length = 100 target-version = "py310" select = ["E", "F", "B", "I"] fix = true [tool.ruff.format] docstring-code-format = true [tool.mypy] strict = true python_version = "3.10" ignore_missing_imports = true disallow_untyped_defs = true check_untyped_defs = true warn_return_any = true show_error_codes = true warn_unused_ignores = false ``` #### Key Formatting Rules - **Line Length**: Maximum of 100 characters - **Python Version**: Code should be compatible with Python 3.10+ - **Imports**: Automatically sorted (using Ruff's "I" rule) - **Type Hints**: Required for all function definitions (strict mypy mode) #### IDE Integration The repository includes VSCode workspace configurations that enable automatic formatting. When you open the workspace files (as recommended in the setup instructions), the correct formatting settings are automatically applied. Python-specific settings in the workspace files: ```json "[python]": { "editor.formatOnSave": true, "editor.defaultFormatter": "ms-python.black-formatter", "editor.codeActionsOnSave": { "source.organizeImports": "explicit" } } ``` Recommended VS Code extensions: - Black Formatter (ms-python.black-formatter) - Ruff (charliermarsh.ruff) - Pylance (ms-python.vscode-pylance) #### Manual Formatting To manually format code: ```bash # Format all Python files using Black pdm run black . # Run Ruff linter with auto-fix pdm run ruff check --fix . # Run type checking with MyPy pdm run mypy . ``` #### Pre-commit Validation Before submitting a pull request, ensure your code passes all formatting checks: ```bash # Run all checks pdm run black --check . pdm run ruff check . pdm run mypy . 
``` ### Swift Code (Lume) For Swift code in the `libs/lume` directory: - Follow the [Swift API Design Guidelines](https://www.swift.org/documentation/api-design-guidelines/) - Use SwiftFormat for consistent formatting - Code will be automatically formatted on save when using the lume workspace ## /docs/FAQ.md # FAQs ### Why a local sandbox? A local sandbox is a dedicated environment that is isolated from the rest of the system. As AI agents rapidly evolve towards 70-80% success rates on average tasks, having a controlled and secure environment becomes crucial. Cua's Computer-Use AI agents run in a local sandbox to ensure reliability, safety, and controlled execution. Benefits of using a local sandbox rather than running the Computer-Use AI agent in the host system: - **Reliability**: The sandbox provides a reproducible environment - critical for benchmarking and debugging agent behavior. Frameworks like [OSWorld](https://github.com/xlang-ai/OSWorld), [Simular AI](https://github.com/simular-ai/Agent-S), Microsoft's [OmniTool](https://github.com/microsoft/OmniParser/tree/master/omnitool), [WindowsAgentArena](https://github.com/microsoft/WindowsAgentArena) and more are using Computer-Use AI agents running in local sandboxes. - **Safety & Isolation**: The sandbox is isolated from the rest of the system, protecting sensitive data and system resources. As CUA agent capabilities grow, this isolation becomes increasingly important for preventing potential safety breaches. - **Control**: The sandbox can be easily monitored and terminated if needed, providing oversight for autonomous agent operation. ### Where are the sandbox images stored? Sandbox are stored in `~/.lume`, and cached images are stored in `~/.lume/cache`. ### Which image is Computer using? Computer uses an optimized macOS image for Computer-Use interactions, with pre-installed apps and settings for optimal performance. 
The image is available on our [ghcr registry](https://github.com/orgs/trycua/packages/container/package/macos-sequoia-cua). ### Are Sandbox disks taking up all the disk space? No, macOS uses sparse files, which only allocate space as needed. For example, VM disks totaling 50 GB may only use 20 GB on disk. ### How do I delete a VM? ```bash lume delete ``` ### How do I fix EasyOCR `[SSL: CERTIFICATE_VERIFY_FAILED]` errors? **Symptom:** When running an agent that uses OCR (e.g., with `AgentLoop.OMNI`), you might encounter an error during the first run or initialization phase that includes: ``` ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000) ``` **Cause:** This usually happens when EasyOCR attempts to download its language models over HTTPS for the first time. Python's SSL module cannot verify the server's certificate because it can't locate the necessary root Certificate Authority (CA) certificates in your environment's trust store. **Solution:** You need to explicitly tell Python where to find a trusted CA bundle. The `certifi` package provides one. Before running your Python agent script **the first time it needs to download models**, set the following environment variables in the *same terminal session*: ```bash # Ensure certifi is installed: pip show certifi export SSL_CERT_FILE=$(python -m certifi) export REQUESTS_CA_BUNDLE=$(python -m certifi) # Now run your Python script that uses the agent... # python your_agent_script.py ``` This directs Python to use the CA bundle provided by `certifi` for SSL verification. **Note:** Once EasyOCR has successfully downloaded its models, you typically do not need to set these environment variables before every subsequent run. ### How do I troubleshoot the agent failing to get the VM IP address or getting stuck on "VM status changed to: stopped"? 
**Symptom:** When running your agent script (e.g., using `Computer().run(...)`), the script might hang during the VM startup phase, logging messages like: * `Waiting for VM to be ready...` * `VM status changed to: stopped (after 0.0s)` * `Still waiting for VM IP address... (elapsed: XX.Xs)` * Eventually, it might time out, or you might notice the VM window never appears or closes quickly. **Cause:** This is typically due to known instability issues with the `lume serve` background daemon process, as documented in the main `README.md`: 1. **`lume serve` Crash:** The `lume serve` process might terminate unexpectedly shortly after launch or when the script tries to interact with it. If it's not running, the script cannot get VM status updates or the IP address. 2. **Incorrect Status Reporting:** Even if `lume serve` is running, its API sometimes incorrectly reports the VM status as `stopped` immediately after startup is initiated. While the underlying `Computer` library tries to poll and wait for the correct `running` status, this initial incorrect report can cause delays or failures if the status doesn't update correctly within the timeout or if `lume serve` crashes during the polling. **Troubleshooting Steps:** 1. **Check `lume serve`:** Is the `lume serve` process still running in its terminal? Did it print any errors or exit? If it's not running, stop your agent script (`Ctrl+C`) and proceed to step 2. 2. **Force Cleanup:** Before *every* run, perform a rigorous cleanup to ensure no old `lume` processes or VM states interfere. Open a **new terminal** and run: ```bash # Stop any running Lume VM gracefully first (replace if needed) lume stop macos-sequoia-cua_latest # Force kill lume serve and related processes pkill -f "lume serve" pkill -9 -f "lume" pkill -9 -f "VzVirtualMachine" # Kills underlying VM process # Optional: Verify they are gone # ps aux | grep -E 'lume|VzVirtualMachine' | grep -v grep ``` 3. 
**Restart Sequence:** * **Terminal 1:** Start `lume serve` cleanly: ```bash lume serve ``` *(Watch this terminal to ensure it stays running).* * **Terminal 2:** Run your agent script (including the `export SSL_CERT_FILE...` commands if *first time* using OCR): ```bash # export SSL_CERT_FILE=$(python -m certifi) # Only if first run with OCR # export REQUESTS_CA_BUNDLE=$(python -m certifi) # Only if first run with OCR python your_agent_script.py ``` 4. **Retry:** Due to the intermittent nature of the Lume issues, sometimes simply repeating steps 2 and 3 allows the run to succeed if the timing avoids the status reporting bug or the `lume serve` crash. **Related Issue: "No route to host" Error (macOS Sequoia+)** * **Symptom:** Even if the `Computer` library logs show the VM has obtained an IP address, you might encounter connection errors like `No route to host` when the agent tries to connect to the internal server, especially when running the agent script from within an IDE (like VS Code or Cursor). * **Cause:** This is often due to macOS Sequoia's enhanced local network privacy controls. Applications need explicit permission to access the local network, which includes communicating with the VM. * **Solution:** Grant "Local Network" access to the application you are running the script from (e.g., your IDE or terminal application). Go to **System Settings > Privacy & Security > Local Network**, find your application in the list, and toggle the switch ON. You might need to trigger a connection attempt from the application first for it to appear in the list. See [GitHub Issue #61](https://github.com/trycua/cua/issues/61) for more details and discussion. **Note:** Improving the stability of `lume serve` is an ongoing development area. ### How do I troubleshoot Computer not connecting to lume daemon? If you're experiencing connection issues between Computer and the lume daemon, it could be because the port 3000 (used by lume) is already in use by an orphaned process. 
You can diagnose this issue with:

```bash
sudo lsof -i :3000
```

This command will show all processes using port 3000. If you see a lume process already running, you can terminate it with:

```bash
kill <PID>
```

Where `<PID>` is the process ID shown in the output of the `lsof` command. After terminating the process, run `lume serve` again to start the lume daemon.

### What information does Cua track?

Cua tracks anonymized usage and error report statistics; we ascribe to Posthog's approach as detailed [here](https://posthog.com/blog/open-source-telemetry-ethical). If you would like to opt out of sending anonymized info, you can set `telemetry_enabled` to false in the Computer or Agent constructor. Check out our [Telemetry](Telemetry.md) documentation for more details.

## /docs/Telemetry.md

# Telemetry in CUA

This document explains how telemetry works in CUA libraries and how you can control it.

CUA tracks anonymized usage and error report statistics; we ascribe to Posthog's approach as detailed [here](https://posthog.com/blog/open-source-telemetry-ethical). If you would like to opt out of sending anonymized info, you can set `telemetry_enabled` to false.

## What telemetry data we collect

CUA libraries collect minimal anonymous usage data to help improve our software. The telemetry data we collect is specifically limited to:

- Basic system information:
  - Operating system (e.g., 'darwin', 'win32', 'linux')
  - Python version (e.g., '3.10.0')
- Module initialization events:
  - When a module (like 'computer' or 'agent') is imported
  - Version of the module being used

We do NOT collect:

- Personal information
- Contents of files
- Specific text being typed
- Actual screenshots or screen contents
- User-specific identifiers
- API keys
- File contents
- Application data or content
- User interactions with the computer
- Information about files being accessed

## Controlling Telemetry

We are committed to transparency and user control over telemetry.
There are two ways to control telemetry:

## 1. Environment Variable (Global Control)

Telemetry is enabled by default. To disable telemetry, set the `CUA_TELEMETRY_ENABLED` environment variable to a falsy value (`0`, `false`, `no`, or `off`):

```bash
# Disable telemetry before running your script
export CUA_TELEMETRY_ENABLED=false

# Or as part of the command
CUA_TELEMETRY_ENABLED=false python your_script.py
```

Or from Python:

```python
import os
os.environ["CUA_TELEMETRY_ENABLED"] = "false"
```

## 2. Instance-Level Control

You can control telemetry for specific CUA instances by setting `telemetry_enabled` when creating them:

```python
# Disable telemetry for a specific Computer instance
computer = Computer(telemetry_enabled=False)

# Enable telemetry for a specific Agent instance
agent = ComputerAgent(telemetry_enabled=True)
```

You can check if telemetry is enabled for an instance:

```python
print(computer.telemetry_enabled)  # Will print True or False
```

Note that telemetry settings must be configured during initialization and cannot be changed after the object is created.

## Transparency

We believe in being transparent about the data we collect. If you have any questions about our telemetry practices, please open an issue on our GitHub repository.
## /examples/agent_examples.py ```py path="/examples/agent_examples.py" """Example demonstrating the ComputerAgent capabilities with the Omni provider.""" import asyncio import logging import traceback import signal from computer import Computer # Import the unified agent class and types from agent import ComputerAgent, LLMProvider, LLM, AgentLoop # Import utility functions from utils import load_dotenv_files, handle_sigint # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) async def run_agent_example(): """Run example of using the ComputerAgent with OpenAI and Omni provider.""" print("\n=== Example: ComputerAgent with OpenAI and Omni provider ===") try: # Create Computer instance with async context manager async with Computer(verbosity=logging.DEBUG) as macos_computer: # Create agent with loop and provider agent = ComputerAgent( computer=macos_computer, # loop=AgentLoop.OPENAI, # loop=AgentLoop.ANTHROPIC, # loop=AgentLoop.UITARS, loop=AgentLoop.OMNI, # model=LLM(provider=LLMProvider.OPENAI), # No model name for Operator CUA # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"), # model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"), # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"), model=LLM( provider=LLMProvider.OAICOMPAT, name="gemma-3-12b-it", provider_base_url="http://localhost:1234/v1", # LM Studio local endpoint ), save_trajectory=True, only_n_most_recent_images=3, verbosity=logging.DEBUG, ) tasks = [ "Look for a repository named trycua/cua on GitHub.", "Check the open issues, open the most recent one and read it.", "Clone the repository in users/lume/projects if it doesn't exist yet.", "Open the repository with an app named Cursor (on the dock, black background and white cube icon).", "From Cursor, open Composer if not already open.", "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.", ] for i, task in enumerate(tasks): 
print(f"\nExecuting task {i}/{len(tasks)}: {task}") async for result in agent.run(task): print("Response ID: ", result.get("id")) # Print detailed usage information usage = result.get("usage") if usage: print("\nUsage Details:") print(f" Input Tokens: {usage.get('input_tokens')}") if "input_tokens_details" in usage: print(f" Input Tokens Details: {usage.get('input_tokens_details')}") print(f" Output Tokens: {usage.get('output_tokens')}") if "output_tokens_details" in usage: print(f" Output Tokens Details: {usage.get('output_tokens_details')}") print(f" Total Tokens: {usage.get('total_tokens')}") print("Response Text: ", result.get("text")) # Print tools information tools = result.get("tools") if tools: print("\nTools:") print(tools) # Print reasoning and tool call outputs outputs = result.get("output", []) for output in outputs: output_type = output.get("type") if output_type == "reasoning": print("\nReasoning Output:") print(output) elif output_type == "computer_call": print("\nTool Call Output:") print(output) print(f"\n✅ Task {i+1}/{len(tasks)} completed: {task}") except Exception as e: logger.error(f"Error in run_agent_example: {e}") traceback.print_exc() raise def main(): """Run the Anthropic agent example.""" try: load_dotenv_files() # Register signal handler for graceful exit signal.signal(signal.SIGINT, handle_sigint) asyncio.run(run_agent_example()) except Exception as e: print(f"Error running example: {e}") traceback.print_exc() if __name__ == "__main__": main() ``` ## /examples/agent_ui_examples.py ```py path="/examples/agent_ui_examples.py" #!/usr/bin/env python3 """ Simple example script for the Computer-Use Agent Gradio UI. This script launches the advanced Gradio UI for the Computer-Use Agent with full model selection and configuration options. It can be run directly from the command line. 
""" from utils import load_dotenv_files load_dotenv_files() # Import the create_gradio_ui function from agent.ui.gradio.app import create_gradio_ui if __name__ == "__main__": print("Launching Computer-Use Agent Gradio UI with advanced features...") app = create_gradio_ui() app.launch(share=False) ``` ## /examples/computer_examples.py ```py path="/examples/computer_examples.py" import os import asyncio from pathlib import Path import sys import json import traceback # Load environment variables from .env file project_root = Path(__file__).parent.parent env_file = project_root / ".env" print(f"Loading environment from: {env_file}") from dotenv import load_dotenv load_dotenv(env_file) # Add paths to sys.path if needed pythonpath = os.environ.get("PYTHONPATH", "") for path in pythonpath.split(":"): if path and path not in sys.path: sys.path.append(path) print(f"Added to sys.path: {path}") from computer.computer import Computer from computer.logger import LogLevel from computer.utils import get_image_size async def main(): try: print("\n=== Using direct initialization ===") # Create computer with configured host computer = Computer( display="1024x768", # Higher resolution memory="8GB", # More memory cpu="4", # More CPU cores os="macos", verbosity=LogLevel.NORMAL, # Use QUIET to suppress most logs use_host_computer_server=False, ) try: await computer.run() await computer.interface.hotkey("command", "space") # res = await computer.interface.run_command("touch ./Downloads/empty_file") # print(f"Run command result: {res}") accessibility_tree = await computer.interface.get_accessibility_tree() print(f"Accessibility tree: {accessibility_tree}") # Screen Actions Examples # print("\n=== Screen Actions ===") # screenshot = await computer.interface.screenshot() # with open("screenshot_direct.png", "wb") as f: # f.write(screenshot) screen_size = await computer.interface.get_screen_size() print(f"Screen size: {screen_size}") # Demonstrate coordinate conversion center_x, center_y = 
733, 736 print(f"Center in screen coordinates: ({center_x}, {center_y})") screenshot_center = await computer.to_screenshot_coordinates(center_x, center_y) print(f"Center in screenshot coordinates: {screenshot_center}") screen_center = await computer.to_screen_coordinates(*screenshot_center) print(f"Back to screen coordinates: {screen_center}") # Mouse Actions Examples print("\n=== Mouse Actions ===") await computer.interface.move_cursor(100, 100) await computer.interface.left_click() await computer.interface.right_click(300, 300) await computer.interface.double_click(400, 400) # Keyboard Actions Examples print("\n=== Keyboard Actions ===") await computer.interface.type_text("Hello, World!") await computer.interface.press_key("enter") # Clipboard Actions Examples print("\n=== Clipboard Actions ===") await computer.interface.set_clipboard("Test clipboard") content = await computer.interface.copy_to_clipboard() print(f"Clipboard content: {content}") finally: # Important to clean up resources pass # await computer.stop() except Exception as e: print(f"Error in main: {e}") traceback.print_exc() if __name__ == "__main__": asyncio.run(main()) ``` ## /examples/computer_ui_examples.py ```py path="/examples/computer_ui_examples.py" #!/usr/bin/env python3 """ Simple example script for the Computer Interface Gradio UI. This script launches the advanced Gradio UI for the Computer Interface with full model selection and configuration options. It can be run directly from the command line. 
""" from utils import load_dotenv_files load_dotenv_files() # Import the create_gradio_ui function from computer.ui.gradio.app import create_gradio_ui if __name__ == "__main__": print("Launching Computer Interface Gradio UI with advanced features...") app = create_gradio_ui() app.launch(share=False) # Optional: Using the saved dataset # import datasets # from computer.ui.utils import convert_to_unsloth # ds = datasets.load_dataset("ddupont/highquality-cua-demonstrations") # ds = convert_to_unsloth(ds) ``` ## /examples/pylume_examples.py ```py path="/examples/pylume_examples.py" import asyncio from pylume import PyLume, ImageRef, VMRunOpts, SharedDirectory, VMConfig, VMUpdateOpts async def main(): """Example usage of PyLume.""" async with PyLume(port=3000, use_existing_server=False, debug=True) as pylume: # Get latest IPSW URL print("\n=== Getting Latest IPSW URL ===") url = await pylume.get_latest_ipsw_url() print("Latest IPSW URL:", url) # Create a new VM print("\n=== Creating a new VM ===") vm_config = VMConfig( name="lume-vm-new", os="macOS", cpu=2, memory="4GB", disk_size="64GB", # type: ignore display="1024x768", ipsw="latest", ) await pylume.create_vm(vm_config) # Get latest IPSW URL print("\n=== Getting Latest IPSW URL ===") url = await pylume.get_latest_ipsw_url() print("Latest IPSW URL:", url) # List available images print("\n=== Listing Available Images ===") images = await pylume.get_images() print("Available Images:", images) # List all VMs to verify creation print("\n=== Listing All VMs ===") vms = await pylume.list_vms() print("VMs:", vms) # Get specific VM details print("\n=== Getting VM Details ===") vm = await pylume.get_vm("lume-vm") print("VM Details:", vm) # Update VM settings print("\n=== Updating VM Settings ===") update_opts = VMUpdateOpts(cpu=8, memory="4GB") await pylume.update_vm("lume-vm", update_opts) # Pull an image image_ref = ImageRef( image="macos-sequoia-vanilla", tag="latest", registry="ghcr.io", organization="trycua" ) await 
pylume.pull_image(image_ref, name="lume-vm-pulled") # Run with shared directory run_opts = VMRunOpts( no_display=False, # type: ignore shared_directories=[ # type: ignore SharedDirectory(host_path="~/shared", read_only=False) # type: ignore ], ) await pylume.run_vm("lume-vm", run_opts) # Or simpler: await pylume.run_vm("lume-vm") # Clone VM print("\n=== Cloning VM ===") await pylume.clone_vm("lume-vm", "lume-vm-cloned") # Stop VM print("\n=== Stopping VM ===") await pylume.stop_vm("lume-vm") # Delete VM print("\n=== Deleting VM ===") await pylume.delete_vm("lume-vm-cloned") if __name__ == "__main__": asyncio.run(main()) ``` ## /examples/som_examples.py ```py path="/examples/som_examples.py" #!/usr/bin/env python3 """ Example script demonstrating the usage of OmniParser's UI element detection functionality. This script shows how to: 1. Initialize the OmniParser 2. Load and process images 3. Visualize detection results 4. Compare performance between CPU and MPS (Apple Silicon) """ import argparse import logging import sys from pathlib import Path import time from PIL import Image from typing import Dict, Any, List, Optional import numpy as np import io import base64 import glob import os # Load environment variables from .env file project_root = Path(__file__).parent.parent env_file = project_root / ".env" print(f"Loading environment from: {env_file}") from dotenv import load_dotenv load_dotenv(env_file) # Add paths to sys.path if needed pythonpath = os.environ.get("PYTHONPATH", "") for path in pythonpath.split(":"): if path and path not in sys.path: sys.path.append(path) print(f"Added to sys.path: {path}") # Add the libs directory to the path to find som libs_path = project_root / "libs" if str(libs_path) not in sys.path: sys.path.append(str(libs_path)) print(f"Added to sys.path: {libs_path}") from som import OmniParser, ParseResult, IconElement, TextElement from som.models import UIElement, ParserMetadata, BoundingBox # Configure logging logging.basicConfig( 
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) logger = logging.getLogger(__name__) def setup_logging(): """Configure logging with a nice format.""" logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) class Timer: """Enhanced context manager for timing code blocks.""" def __init__(self, name: str, logger): self.name = name self.logger = logger self.start_time: float = 0.0 self.elapsed_time: float = 0.0 def __enter__(self): self.start_time = time.time() return self def __exit__(self, *args): self.elapsed_time = time.time() - self.start_time self.logger.info(f"{self.name}: {self.elapsed_time:.3f}s") return False def image_to_bytes(image: Image.Image) -> bytes: """Convert PIL Image to PNG bytes.""" buf = io.BytesIO() image.save(buf, format="PNG") return buf.getvalue() def process_image( parser: OmniParser, image_path: str, output_dir: Path, use_ocr: bool = False ) -> None: """Process a single image and save the result.""" try: # Load image logger.info(f"Processing image: {image_path}") image = Image.open(image_path).convert("RGB") logger.info(f"Image loaded successfully, size: {image.size}") # Create output filename input_filename = Path(image_path).stem output_path = output_dir / f"{input_filename}_analyzed.png" # Convert image to PNG bytes image_bytes = image_to_bytes(image) # Process image with Timer(f"Processing {input_filename}", logger): result = parser.parse(image_bytes, use_ocr=use_ocr) logger.info( f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements" ) # Save the annotated image logger.info(f"Saving annotated image to: {output_path}") try: # Save image from base64 img_data = base64.b64decode(result.annotated_image_base64) img = Image.open(io.BytesIO(img_data)) img.save(output_path) # Print detailed results logger.info("\nDetected Elements:") for elem in result.elements: if 
isinstance(elem, IconElement): logger.info( f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" ) elif isinstance(elem, TextElement): logger.info( f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" ) # Verify file exists and log size if output_path.exists(): logger.info( f"Successfully saved image. File size: {output_path.stat().st_size} bytes" ) else: logger.error(f"Failed to verify file at {output_path}") except Exception as e: logger.error(f"Error saving image: {str(e)}", exc_info=True) except Exception as e: logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True) def run_detection_benchmark( input_path: str, output_dir: Path, use_ocr: bool = False, box_threshold: float = 0.01, iou_threshold: float = 0.1, ): """Run detection benchmark on images.""" logger.info( f"Starting benchmark with OCR enabled: {use_ocr}, box_threshold: {box_threshold}, iou_threshold: {iou_threshold}" ) try: # Initialize parser logger.info("Initializing OmniParser...") parser = OmniParser() # Create output directory output_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Output directory created at: {output_dir}") # Get list of PNG files if os.path.isdir(input_path): image_files = glob.glob(os.path.join(input_path, "*.png")) else: image_files = [input_path] logger.info(f"Found {len(image_files)} images to process") # Process each image with specified thresholds for image_path in image_files: try: # Load image logger.info(f"Processing image: {image_path}") image = Image.open(image_path).convert("RGB") logger.info(f"Image loaded successfully, size: {image.size}") # Create output filename input_filename = Path(image_path).stem output_path = output_dir / f"{input_filename}_analyzed.png" # Convert image to PNG bytes image_bytes = image_to_bytes(image) # Process image with specified thresholds with Timer(f"Processing {input_filename}", logger): result = parser.parse( image_bytes, use_ocr=use_ocr, 
box_threshold=box_threshold, iou_threshold=iou_threshold, ) logger.info( f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements" ) # Save the annotated image logger.info(f"Saving annotated image to: {output_path}") try: # Save image from base64 img_data = base64.b64decode(result.annotated_image_base64) img = Image.open(io.BytesIO(img_data)) img.save(output_path) # Print detailed results logger.info("\nDetected Elements:") for elem in result.elements: if isinstance(elem, IconElement): logger.info( f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" ) elif isinstance(elem, TextElement): logger.info( f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}" ) # Verify file exists and log size if output_path.exists(): logger.info( f"Successfully saved image. File size: {output_path.stat().st_size} bytes" ) else: logger.error(f"Failed to verify file at {output_path}") except Exception as e: logger.error(f"Error saving image: {str(e)}", exc_info=True) except Exception as e: logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True) except Exception as e: logger.error(f"Benchmark failed: {str(e)}", exc_info=True) raise def run_experiments(input_path: str, output_dir: Path, use_ocr: bool = False): """Run experiments with different threshold combinations.""" # Define threshold values to test box_thresholds = [0.01, 0.05, 0.1, 0.3] iou_thresholds = [0.05, 0.1, 0.2, 0.5] logger.info("Starting threshold experiments...") logger.info("Box thresholds to test: %s", box_thresholds) logger.info("IOU thresholds to test: %s", iou_thresholds) # Create results directory for this experiment timestamp = time.strftime("%Y%m%d-%H%M%S") ocr_suffix = "_ocr" if use_ocr else "_no_ocr" exp_dir = output_dir / f"experiment_{timestamp}{ocr_suffix}" exp_dir.mkdir(parents=True, exist_ok=True) # Create a summary file summary_file = exp_dir / "results_summary.txt" with open(summary_file, "w") as f: 
f.write("Threshold Experiments Results\n") f.write("==========================\n\n") f.write(f"Input: {input_path}\n") f.write(f"OCR Enabled: {use_ocr}\n") f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n") f.write("Results:\n") f.write("-" * 80 + "\n") f.write( f"{'Box Thresh':^10} | {'IOU Thresh':^10} | {'Num Icons':^10} | {'Num Text':^10} | {'Time (s)':^10}\n" ) f.write("-" * 80 + "\n") # Initialize parser once for all experiments parser = OmniParser() # Run experiments with each combination for box_thresh in box_thresholds: for iou_thresh in iou_thresholds: logger.info(f"\nTesting box_threshold={box_thresh}, iou_threshold={iou_thresh}") # Create directory for this combination combo_dir = exp_dir / f"box_{box_thresh}_iou_{iou_thresh}" combo_dir.mkdir(exist_ok=True) try: # Process each image if os.path.isdir(input_path): image_files = glob.glob(os.path.join(input_path, "*.png")) else: image_files = [input_path] total_icons = 0 total_text = 0 total_time = 0 for image_path in image_files: # Load and process image image = Image.open(image_path).convert("RGB") image_bytes = image_to_bytes(image) # Process with current thresholds with Timer(f"Processing {Path(image_path).stem}", logger) as t: result = parser.parse( image_bytes, use_ocr=use_ocr, box_threshold=box_thresh, iou_threshold=iou_thresh, ) # Save annotated image output_path = combo_dir / f"{Path(image_path).stem}_analyzed.png" img_data = base64.b64decode(result.annotated_image_base64) img = Image.open(io.BytesIO(img_data)) img.save(output_path) # Update totals total_icons += result.metadata.num_icons total_text += result.metadata.num_text # Log detailed results detail_file = combo_dir / f"{Path(image_path).stem}_details.txt" with open(detail_file, "w") as detail_f: detail_f.write(f"Results for {Path(image_path).name}\n") detail_f.write("-" * 40 + "\n") detail_f.write(f"Number of icons: {result.metadata.num_icons}\n") detail_f.write( f"Number of text elements: {result.metadata.num_text}\n\n" ) 
detail_f.write("Icon Detections:\n") icon_count = 1 text_count = ( result.metadata.num_icons + 1 ) # Text boxes start after icons # First list all icons for elem in result.elements: if isinstance(elem, IconElement): detail_f.write(f"Box #{icon_count}: Icon\n") detail_f.write(f" - Confidence: {elem.confidence:.3f}\n") detail_f.write( f" - Coordinates: {elem.bbox.coordinates}\n" ) icon_count += 1 if use_ocr: detail_f.write("\nText Detections:\n") for elem in result.elements: if isinstance(elem, TextElement): detail_f.write(f"Box #{text_count}: Text\n") detail_f.write(f" - Content: '{elem.content}'\n") detail_f.write( f" - Confidence: {elem.confidence:.3f}\n" ) detail_f.write( f" - Coordinates: {elem.bbox.coordinates}\n" ) text_count += 1 # Update timing totals total_time += t.elapsed_time # Write summary for this combination avg_time = total_time / len(image_files) f.write( f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {total_icons:^10d} | {total_text:^10d} | {avg_time:^10.3f}\n" ) except Exception as e: logger.error( f"Error in experiment box={box_thresh}, iou={iou_thresh}: {str(e)}" ) f.write( f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {'ERROR':^10s} | {'ERROR':^10s} | {'ERROR':^10s}\n" ) # Write summary footer f.write("-" * 80 + "\n") f.write("\nExperiment completed successfully!\n") logger.info(f"\nExperiment results saved to {exp_dir}") logger.info(f"Summary file: {summary_file}") def main(): """Main entry point.""" parser = argparse.ArgumentParser(description="Run OmniParser benchmark") parser.add_argument("input_path", help="Path to input image or directory containing images") parser.add_argument( "--output-dir", default="examples/output", help="Output directory for annotated images" ) parser.add_argument( "--ocr", choices=["none", "easyocr"], default="none", help="OCR engine to use (default: none)", ) parser.add_argument( "--mode", choices=["single", "experiment"], default="single", help="Run mode: single run or threshold experiments (default: single)", ) 
parser.add_argument( "--box-threshold", type=float, default=0.01, help="Confidence threshold for detection (default: 0.01)", ) parser.add_argument( "--iou-threshold", type=float, default=0.1, help="IOU threshold for Non-Maximum Suppression (default: 0.1)", ) args = parser.parse_args() logger.info(f"Starting OmniParser with arguments: {args}") use_ocr = args.ocr != "none" output_dir = Path(args.output_dir) try: if args.mode == "experiment": run_experiments(args.input_path, output_dir, use_ocr) else: run_detection_benchmark( args.input_path, output_dir, use_ocr, args.box_threshold, args.iou_threshold ) except Exception as e: logger.error(f"Process failed: {str(e)}", exc_info=True) return 1 return 0 if __name__ == "__main__": sys.exit(main()) ``` ## /examples/utils.py ```py path="/examples/utils.py" """Utility functions for example scripts.""" import os import sys import signal from pathlib import Path from typing import Optional def load_env_file(path: Path) -> bool: """Load environment variables from a file. Args: path: Path to the .env file Returns: True if file was loaded successfully, False otherwise """ if not path.exists(): return False print(f"Loading environment from {path}") with open(path, "r") as f: for line in f: line = line.strip() if not line or line.startswith("#"): continue key, value = line.split("=", 1) os.environ[key] = value return True def load_dotenv_files(): """Load environment variables from .env files. Tries to load from .env.local first, then .env if .env.local doesn't exist. 
""" # Get the project root directory (parent of the examples directory) project_root = Path(__file__).parent.parent # Try loading .env.local first, then .env if .env.local doesn't exist env_local_path = project_root / ".env.local" env_path = project_root / ".env" # Load .env.local if it exists, otherwise try .env if not load_env_file(env_local_path): load_env_file(env_path) def handle_sigint(signum, frame): """Handle SIGINT (Ctrl+C) gracefully.""" print("\nExiting gracefully...") sys.exit(0) ``` ## /img/agent.png Binary file available at https://raw.githubusercontent.com/trycua/cua/refs/heads/main/img/agent.png ## /img/agent_gradio_ui.png Binary file available at https://raw.githubusercontent.com/trycua/cua/refs/heads/main/img/agent_gradio_ui.png ## /img/cli.png Binary file available at https://raw.githubusercontent.com/trycua/cua/refs/heads/main/img/cli.png ## /img/computer.png Binary file available at https://raw.githubusercontent.com/trycua/cua/refs/heads/main/img/computer.png ## /img/logo_black.png Binary file available at https://raw.githubusercontent.com/trycua/cua/refs/heads/main/img/logo_black.png ## /img/logo_white.png Binary file available at https://raw.githubusercontent.com/trycua/cua/refs/heads/main/img/logo_white.png ## /libs/agent/README.md

# cua-agent
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#) [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#) [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85) [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)

**cua-agent** is a general Computer-Use framework for running multi-app agentic workflows targeting macOS and Linux sandbox created with Cua, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). ### Get started with Agent
## Install ```bash pip install "cua-agent[all]" # or install specific loop providers pip install "cua-agent[openai]" # OpenAI Cua Loop pip install "cua-agent[anthropic]" # Anthropic Cua Loop pip install "cua-agent[uitars]" # UI-Tars support pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models) pip install "cua-agent[ui]" # Gradio UI for the agent ``` ## Run ```bash async with Computer() as macos_computer: # Create agent with loop and provider agent = ComputerAgent( computer=macos_computer, loop=AgentLoop.OPENAI, model=LLM(provider=LLMProvider.OPENAI) # or # loop=AgentLoop.ANTHROPIC, # model=LLM(provider=LLMProvider.ANTHROPIC) # or # loop=AgentLoop.OMNI, # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3") # or # loop=AgentLoop.UITARS, # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") ) tasks = [ "Look for a repository named trycua/cua on GitHub.", "Check the open issues, open the most recent one and read it.", "Clone the repository in users/lume/projects if it doesn't exist yet.", "Open the repository with an app named Cursor (on the dock, black background and white cube icon).", "From Cursor, open Composer if not already open.", "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.", ] for i, task in enumerate(tasks): print(f"\nExecuting task {i}/{len(tasks)}: {task}") async for result in agent.run(task): print(result) print(f"\n✅ Task {i+1}/{len(tasks)} completed: {task}") ``` Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA): - [Agent Notebook](../../notebooks/agent_nb.ipynb) - Complete examples and workflows ## Using the Gradio UI The agent includes a Gradio-based user interface for easier interaction.
To use it: ```bash # Install with Gradio support pip install "cua-agent[ui]" ``` ### Create a simple launcher script ```python # launch_ui.py from agent.ui.gradio.app import create_gradio_ui app = create_gradio_ui() app.launch(share=False) ``` ### Setting up API Keys For the Gradio UI to show available models, you need to set API keys as environment variables: ```bash # For OpenAI models export OPENAI_API_KEY=your_openai_key_here # For Anthropic models export ANTHROPIC_API_KEY=your_anthropic_key_here # Launch with both keys set OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py ``` Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider. ### Using Local Models You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1` which works with LM Studio. If you're using a different local model server: - vLLM: `http://localhost:8000/v1` - LocalAI: `http://localhost:8080/v1` - Ollama with OpenAI compat API: `http://localhost:11434/v1` The Gradio UI provides: - Selection of different agent loops (OpenAI, Anthropic, OMNI) - Model selection for each provider - Configuration of agent parameters - Chat interface for interacting with the agent ### Using UI-TARS You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI. 
## Agent Loops The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques: | Agent Loop | Supported Models | Description | Set-Of-Marks | |:-----------|:-----------------|:------------|:-------------| | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required | | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required | | `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required | | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219`
• `gpt-4.5-preview`
• `gpt-4o`
• `gpt-4`
• `phi4`
• `phi4-mini`
• `gemma3`
• `...`
• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser | ## AgentResponse The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops. ```python async for result in agent.run(task): print("Response ID: ", result.get("id")) # Print detailed usage information usage = result.get("usage") if usage: print("\nUsage Details:") print(f" Input Tokens: {usage.get('input_tokens')}") if "input_tokens_details" in usage: print(f" Input Tokens Details: {usage.get('input_tokens_details')}") print(f" Output Tokens: {usage.get('output_tokens')}") if "output_tokens_details" in usage: print(f" Output Tokens Details: {usage.get('output_tokens_details')}") print(f" Total Tokens: {usage.get('total_tokens')}") print("Response Text: ", result.get("text")) # Print tools information tools = result.get("tools") if tools: print("\nTools:") print(tools) # Print reasoning and tool call outputs outputs = result.get("output", []) for output in outputs: output_type = output.get("type") if output_type == "reasoning": print("\nReasoning Output:") print(output) elif output_type == "computer_call": print("\nTool Call Output:") print(output) ``` **Note on Settings Persistence:** * The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task. * This allows your preferences to persist between sessions. * API keys entered into the custom provider field are **not** saved in this file for security reasons. 
Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file. * It's recommended to add `.gradio_settings.json` to your `.gitignore` file. ## /libs/agent/agent/__init__.py ```py path="/libs/agent/agent/__init__.py" """CUA (Computer Use) Agent for AI-driven computer interaction.""" import sys import logging __version__ = "0.1.0" # Initialize logging logger = logging.getLogger("cua.agent") # Initialize telemetry when the package is imported try: # Import from core telemetry for basic functions from core.telemetry import ( is_telemetry_enabled, flush, record_event, ) # Import set_dimension from our own telemetry module from .core.telemetry import set_dimension # Check if telemetry is enabled if is_telemetry_enabled(): logger.info("Telemetry is enabled") # Record package initialization record_event( "module_init", { "module": "agent", "version": __version__, "python_version": sys.version, }, ) # Set the package version as a dimension set_dimension("agent_version", __version__) # Flush events to ensure they're sent flush() else: logger.info("Telemetry is disabled") except ImportError as e: # Telemetry not available logger.warning(f"Telemetry not available: {e}") except Exception as e: # Other issues with telemetry logger.warning(f"Error initializing telemetry: {e}") from .core.types import LLMProvider, LLM from .core.factory import AgentLoop from .core.agent import ComputerAgent __all__ = ["AgentLoop", "LLMProvider", "LLM", "ComputerAgent"] ``` ## /libs/agent/agent/core/__init__.py ```py path="/libs/agent/agent/core/__init__.py" """Core agent components.""" from .factory import BaseLoop from .messages import ( BaseMessageManager, ImageRetentionConfig, ) from .callbacks import ( CallbackManager, CallbackHandler, BaseCallbackManager, ContentCallback, ToolCallback, APICallback, ) __all__ = [ "BaseLoop", "CallbackManager", "CallbackHandler", "BaseMessageManager", "ImageRetentionConfig", "BaseCallbackManager", "ContentCallback", 
    "ToolCallback",
    "APICallback",
]
```

## /libs/agent/agent/core/agent.py

```py path="/libs/agent/agent/core/agent.py"
"""Main entry point for computer agents."""

import asyncio
import logging
import os
from typing import AsyncGenerator, Optional

from computer import Computer

from .types import LLM, AgentLoop
from .types import AgentResponse
from .factory import LoopFactory
from .provider_config import DEFAULT_MODELS, ENV_VARS

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ComputerAgent:
    """A computer agent that can perform automated tasks using natural language instructions."""

    def __init__(
        self,
        computer: Computer,
        model: LLM,
        loop: AgentLoop,
        max_retries: int = 3,
        screenshot_dir: Optional[str] = None,
        log_dir: Optional[str] = None,
        api_key: Optional[str] = None,
        save_trajectory: bool = True,
        trajectory_dir: str = "trajectories",
        only_n_most_recent_images: Optional[int] = None,
        verbosity: int = logging.INFO,
    ):
        """Initialize the ComputerAgent.

        Args:
            computer: Computer instance the agent will control.
            model: LLM object containing the provider and model name; if the name is
                empty, a provider default from DEFAULT_MODELS is used.
            loop: Which agent loop implementation to drive the session with
                (e.g. AgentLoop.OPENAI, AgentLoop.ANTHROPIC, AgentLoop.OMNI).
            max_retries: Maximum number of retry attempts.
            screenshot_dir: Directory to save screenshots.
            log_dir: Directory to save logs (set to None to disable logging to files).
            api_key: The API key for the provider. If not provided, will look for
                the provider's environment variable (see ENV_VARS).
            save_trajectory: Whether to save the trajectory.
            trajectory_dir: Directory to save the trajectory.
            only_n_most_recent_images: Maximum number of recent screenshots to include in API requests.
            verbosity: Logging level.

        Raises:
            ValueError: If no model name can be resolved for the provider, or if a
                required API key is missing, or if the loop factory rejects the config.
        """
        # Basic agent configuration
        self.max_retries = max_retries
        self.computer = computer
        self.queue = asyncio.Queue()
        self.screenshot_dir = screenshot_dir
        self.log_dir = log_dir
        self._retry_count = 0
        self._initialized = False
        self._in_context = False

        # Set logging level
        logger.setLevel(verbosity)

        # Setup logging
        if self.log_dir:
            os.makedirs(self.log_dir, exist_ok=True)
            logger.info(f"Created logs directory: {self.log_dir}")

        # Setup screenshots directory
        if self.screenshot_dir:
            os.makedirs(self.screenshot_dir, exist_ok=True)
            logger.info(f"Created screenshots directory: {self.screenshot_dir}")

        # Use the provided LLM object
        self.provider = model.provider
        actual_model_name = model.name or DEFAULT_MODELS.get(self.provider, "")
        self.provider_base_url = getattr(model, "provider_base_url", None)

        # Ensure we have a valid model name
        if not actual_model_name:
            actual_model_name = DEFAULT_MODELS.get(self.provider, "")
            if not actual_model_name:
                raise ValueError(
                    f"No model specified for provider {self.provider} and no default found"
                )

        # Get API key from environment if not provided
        actual_api_key = api_key or os.environ.get(ENV_VARS[self.provider], "")

        # Ollama and OpenAI-compatible APIs typically don't require an API key
        if (
            not actual_api_key
            and str(self.provider) not in ["ollama", "oaicompat"]
            and ENV_VARS[self.provider] != "none"
        ):
            raise ValueError(f"No API key provided for {self.provider}")

        # Create the appropriate loop using the factory
        try:
            # Let the factory create the appropriate loop with needed components
            self._loop = LoopFactory.create_loop(
                loop_type=loop,
                provider=self.provider,
                computer=self.computer,
                model_name=actual_model_name,
                api_key=actual_api_key,
                save_trajectory=save_trajectory,
                trajectory_dir=trajectory_dir,
                only_n_most_recent_images=only_n_most_recent_images,
                provider_base_url=self.provider_base_url,
            )
        except ValueError as e:
            logger.error(f"Failed to create loop: {str(e)}")
            raise

        # Initialize the message manager from the loop
        self.message_manager =
self._loop.message_manager logger.info( f"ComputerAgent initialized with provider: {self.provider}, model: {actual_model_name}" ) async def __aenter__(self): """Initialize the agent when used as a context manager.""" logger.info("Entering ComputerAgent context") self._in_context = True # In case the computer wasn't initialized try: # Initialize the computer only if not already initialized logger.info("Checking if computer is already initialized...") if not self.computer._initialized: logger.info("Initializing computer in __aenter__...") # Use the computer's __aenter__ directly instead of calling run() await self.computer.__aenter__() logger.info("Computer initialized in __aenter__") else: logger.info("Computer already initialized, skipping initialization") except Exception as e: logger.error(f"Error initializing computer in __aenter__: {str(e)}") raise await self.initialize() return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Cleanup agent resources if needed.""" logger.info("Cleaning up agent resources") self._in_context = False # Do any necessary cleanup # We're not shutting down the computer here as it might be shared # Just log that we're exiting if exc_type: logger.error(f"Exiting agent context with error: {exc_type.__name__}: {exc_val}") else: logger.info("Exiting agent context normally") # If we have a queue, make sure to signal it's done if hasattr(self, "queue") and self.queue: await self.queue.put(None) # Signal that we're done async def initialize(self) -> None: """Initialize the agent and its components.""" if not self._initialized: # Always initialize the computer if available if self.computer and not self.computer._initialized: await self.computer.run() self._initialized = True async def run(self, task: str) -> AsyncGenerator[AgentResponse, None]: """Run a task using the computer agent. 
Args: task: Task description Yields: Agent response format """ try: logger.info(f"Running task: {task}") logger.info( f"Message history before task has {len(self.message_manager.messages)} messages" ) # Initialize the computer if needed if not self._initialized: await self.initialize() # Add task as a user message using the message manager self.message_manager.add_user_message([{"type": "text", "text": task}]) logger.info( f"Added task message. Message history now has {len(self.message_manager.messages)} messages" ) # Pass properly formatted messages to the loop if self._loop is None: logger.error("Loop not initialized properly") yield {"error": "Loop not initialized properly"} return # Execute the task and yield results async for result in self._loop.run(self.message_manager.messages): yield result except Exception as e: logger.error(f"Error in agent run method: {str(e)}") yield { "role": "assistant", "content": f"Error: {str(e)}", "metadata": {"title": "❌ Error"}, } ``` ## /libs/agent/agent/core/base.py ```py path="/libs/agent/agent/core/base.py" """Base loop definitions.""" import logging import asyncio from abc import ABC, abstractmethod from typing import Any, AsyncGenerator, Dict, List, Optional from agent.providers.omni.parser import ParseResult from computer import Computer from .messages import StandardMessageManager, ImageRetentionConfig from .types import AgentResponse from .experiment import ExperimentManager from .callbacks import CallbackManager, CallbackHandler logger = logging.getLogger(__name__) class BaseLoop(ABC): """Base class for agent loops that handle message processing and tool execution.""" def __init__( self, computer: Computer, model: str, api_key: str, max_tokens: int = 4096, max_retries: int = 3, retry_delay: float = 1.0, base_dir: Optional[str] = "trajectories", save_trajectory: bool = True, only_n_most_recent_images: Optional[int] = 2, callback_handlers: Optional[List[CallbackHandler]] = None, **kwargs, ): """Initialize base agent 
loop. Args: computer: Computer instance to control model: Model name to use api_key: API key for provider max_tokens: Maximum tokens to generate max_retries: Maximum number of retries retry_delay: Delay between retries in seconds base_dir: Base directory for saving experiment data save_trajectory: Whether to save trajectory data only_n_most_recent_images: Maximum number of recent screenshots to include in API requests **kwargs: Additional provider-specific arguments """ self.computer = computer self.model = model self.api_key = api_key self.max_tokens = max_tokens self.max_retries = max_retries self.retry_delay = retry_delay self.base_dir = base_dir self.save_trajectory = save_trajectory self.only_n_most_recent_images = only_n_most_recent_images self._kwargs = kwargs # Initialize message manager self.message_manager = StandardMessageManager( config=ImageRetentionConfig(num_images_to_keep=only_n_most_recent_images) ) # Initialize experiment manager if self.save_trajectory and self.base_dir: self.experiment_manager = ExperimentManager( base_dir=self.base_dir, only_n_most_recent_images=only_n_most_recent_images, ) # Track directories for convenience self.run_dir = self.experiment_manager.run_dir self.current_turn_dir = self.experiment_manager.current_turn_dir else: self.experiment_manager = None self.run_dir = None self.current_turn_dir = None # Initialize basic tracking self.turn_count = 0 # Initialize callback manager self.callback_manager = CallbackManager(handlers=callback_handlers or []) async def initialize(self) -> None: """Initialize both the API client and computer interface with retries.""" for attempt in range(self.max_retries): try: logger.info( f"Starting initialization (attempt {attempt + 1}/{self.max_retries})..." 
) # Initialize API client await self.initialize_client() logger.info("Initialization complete.") return except Exception as e: if attempt < self.max_retries - 1: logger.warning( f"Initialization failed (attempt {attempt + 1}/{self.max_retries}): {str(e)}. Retrying..." ) await asyncio.sleep(self.retry_delay) else: logger.error( f"Initialization failed after {self.max_retries} attempts: {str(e)}" ) raise RuntimeError(f"Failed to initialize: {str(e)}") ########################################### # ABSTRACT METHODS TO BE IMPLEMENTED BY SUBCLASSES ########################################### @abstractmethod async def initialize_client(self) -> None: """Initialize the API client and any provider-specific components. This method must be implemented by subclasses to set up provider-specific clients and tools. """ raise NotImplementedError @abstractmethod def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]: """Run the agent loop with provided messages. Args: messages: List of message objects Returns: An async generator that yields agent responses """ raise NotImplementedError ########################################### # EXPERIMENT AND TRAJECTORY MANAGEMENT ########################################### def _setup_experiment_dirs(self) -> None: """Setup the experiment directory structure.""" if self.experiment_manager: # Use the experiment manager to set up directories self.experiment_manager.setup_experiment_dirs() # Update local tracking variables self.run_dir = self.experiment_manager.run_dir self.current_turn_dir = self.experiment_manager.current_turn_dir def _create_turn_dir(self) -> None: """Create a new directory for the current turn.""" if self.experiment_manager: # Use the experiment manager to create the turn directory self.experiment_manager.create_turn_dir() # Update local tracking variables self.current_turn_dir = self.experiment_manager.current_turn_dir self.turn_count = self.experiment_manager.turn_count def _log_api_call( self, 
call_type: str, request: Any, response: Any = None, error: Optional[Exception] = None ) -> None: """Log API call details to file. Preserves provider-specific formats for requests and responses to ensure accurate logging for debugging and analysis purposes. Args: call_type: Type of API call (e.g., 'request', 'response', 'error') request: The API request data in provider-specific format response: Optional API response data in provider-specific format error: Optional error information """ if self.experiment_manager: # Use the experiment manager to log the API call provider = getattr(self, "provider", "unknown") provider_str = str(provider) if provider else "unknown" self.experiment_manager.log_api_call( call_type=call_type, request=request, provider=provider_str, model=self.model, response=response, error=error, ) def _save_screenshot(self, img_base64: str, action_type: str = "") -> None: """Save a screenshot to the experiment directory. Args: img_base64: Base64 encoded screenshot action_type: Type of action that triggered the screenshot """ if self.experiment_manager: self.experiment_manager.save_screenshot(img_base64, action_type) ########################################### # EVENT HOOKS / CALLBACKS ########################################### async def handle_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None: """Process a screenshot through callback managers Args: screenshot_base64: Base64 encoded screenshot action_type: Type of action that triggered the screenshot """ if hasattr(self, 'callback_manager'): await self.callback_manager.on_screenshot(screenshot_base64, action_type, parsed_screen) ``` ## /libs/agent/agent/core/callbacks.py ```py path="/libs/agent/agent/core/callbacks.py" """Callback handlers for agent.""" import json import logging from abc import ABC, abstractmethod from datetime import datetime from typing import Any, Dict, List, Optional, Protocol from agent.providers.omni.parser 
import ParseResult logger = logging.getLogger(__name__) class ContentCallback(Protocol): """Protocol for content callbacks.""" def __call__(self, content: Dict[str, Any]) -> None: ... class ToolCallback(Protocol): """Protocol for tool callbacks.""" def __call__(self, result: Any, tool_id: str) -> None: ... class APICallback(Protocol): """Protocol for API callbacks.""" def __call__(self, request: Any, response: Any, error: Optional[Exception] = None) -> None: ... class ScreenshotCallback(Protocol): """Protocol for screenshot callbacks.""" def __call__(self, screenshot_base64: str, action_type: str = "") -> Optional[str]: ... class BaseCallbackManager(ABC): """Base class for callback managers.""" def __init__( self, content_callback: ContentCallback, tool_callback: ToolCallback, api_callback: APICallback, ): """Initialize the callback manager. Args: content_callback: Callback for content updates tool_callback: Callback for tool execution results api_callback: Callback for API interactions """ self.content_callback = content_callback self.tool_callback = tool_callback self.api_callback = api_callback @abstractmethod def on_content(self, content: Any) -> None: """Handle content updates.""" raise NotImplementedError @abstractmethod def on_tool_result(self, result: Any, tool_id: str) -> None: """Handle tool execution results.""" raise NotImplementedError @abstractmethod def on_api_interaction( self, request: Any, response: Any, error: Optional[Exception] = None ) -> None: """Handle API interactions.""" raise NotImplementedError class CallbackManager: """Manager for callback handlers.""" def __init__(self, handlers: Optional[List["CallbackHandler"]] = None): """Initialize with optional handlers. Args: handlers: List of callback handlers """ self.handlers = handlers or [] def add_handler(self, handler: "CallbackHandler") -> None: """Add a callback handler. 
Args: handler: Callback handler to add """ self.handlers.append(handler) async def on_action_start(self, action: str, **kwargs) -> None: """Called when an action starts. Args: action: Action name **kwargs: Additional data """ for handler in self.handlers: await handler.on_action_start(action, **kwargs) async def on_action_end(self, action: str, success: bool, **kwargs) -> None: """Called when an action ends. Args: action: Action name success: Whether the action was successful **kwargs: Additional data """ for handler in self.handlers: await handler.on_action_end(action, success, **kwargs) async def on_error(self, error: Exception, **kwargs) -> None: """Called when an error occurs. Args: error: Exception that occurred **kwargs: Additional data """ for handler in self.handlers: await handler.on_error(error, **kwargs) async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None: """Called when a screenshot is taken. Args: screenshot_base64: Base64 encoded screenshot action_type: Type of action that triggered the screenshot parsed_screen: Optional output from parsing the screenshot Returns: Modified screenshot or original if no modifications """ for handler in self.handlers: await handler.on_screenshot(screenshot_base64, action_type, parsed_screen) class CallbackHandler(ABC): """Base class for callback handlers.""" @abstractmethod async def on_action_start(self, action: str, **kwargs) -> None: """Called when an action starts. Args: action: Action name **kwargs: Additional data """ pass @abstractmethod async def on_action_end(self, action: str, success: bool, **kwargs) -> None: """Called when an action ends. Args: action: Action name success: Whether the action was successful **kwargs: Additional data """ pass @abstractmethod async def on_error(self, error: Exception, **kwargs) -> None: """Called when an error occurs. 
Args: error: Exception that occurred **kwargs: Additional data """ pass @abstractmethod async def on_screenshot(self, screenshot_base64: str, action_type: str = "", parsed_screen: Optional[ParseResult] = None) -> None: """Called when a screenshot is taken. Args: screenshot_base64: Base64 encoded screenshot action_type: Type of action that triggered the screenshot Returns: Optional modified screenshot """ pass class DefaultCallbackHandler(CallbackHandler): """Default implementation of CallbackHandler with no-op methods. This class implements all abstract methods from CallbackHandler, allowing subclasses to override only the methods they need. """ async def on_action_start(self, action: str, **kwargs) -> None: """Default no-op implementation.""" pass async def on_action_end(self, action: str, success: bool, **kwargs) -> None: """Default no-op implementation.""" pass async def on_error(self, error: Exception, **kwargs) -> None: """Default no-op implementation.""" pass async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None: """Default no-op implementation.""" pass ``` ## /libs/agent/agent/core/experiment.py ```py path="/libs/agent/agent/core/experiment.py" """Core experiment management for agents.""" import os import logging import base64 from io import BytesIO from datetime import datetime from typing import Any, Dict, List, Optional from PIL import Image import json import re logger = logging.getLogger(__name__) class ExperimentManager: """Manages experiment directories and logging for the agent.""" def __init__( self, base_dir: Optional[str] = None, only_n_most_recent_images: Optional[int] = None, ): """Initialize the experiment manager. 
Args: base_dir: Base directory for saving experiment data only_n_most_recent_images: Maximum number of recent screenshots to include in API requests """ self.base_dir = base_dir self.only_n_most_recent_images = only_n_most_recent_images self.run_dir = None self.current_turn_dir = None self.turn_count = 0 self.screenshot_count = 0 # Track all screenshots for potential API request inclusion self.screenshot_paths = [] # Set up experiment directories if base_dir is provided if self.base_dir: self.setup_experiment_dirs() def setup_experiment_dirs(self) -> None: """Setup the experiment directory structure.""" if not self.base_dir: return # Create base experiments directory if it doesn't exist os.makedirs(self.base_dir, exist_ok=True) # Create timestamped run directory timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") self.run_dir = os.path.join(self.base_dir, timestamp) os.makedirs(self.run_dir, exist_ok=True) logger.info(f"Created run directory: {self.run_dir}") # Create first turn directory self.create_turn_dir() def create_turn_dir(self) -> None: """Create a new directory for the current turn.""" if not self.run_dir: logger.warning("Cannot create turn directory: run_dir not set") return # Increment turn counter self.turn_count += 1 # Create turn directory with padded number turn_name = f"turn_{self.turn_count:03d}" self.current_turn_dir = os.path.join(self.run_dir, turn_name) os.makedirs(self.current_turn_dir, exist_ok=True) logger.info(f"Created turn directory: {self.current_turn_dir}") def sanitize_log_data(self, data: Any) -> Any: """Sanitize log data by replacing large binary data with placeholders. 
Args: data: Data to sanitize Returns: Sanitized copy of the data """ if isinstance(data, dict): result = {} for k, v in data.items(): # Special handling for 'data' field in Anthropic message source if k == "data" and isinstance(v, str) and len(v) > 1000: result[k] = f"[BASE64_DATA_LENGTH_{len(v)}]" # Special handling for the 'media_type' key which indicates we're in an image block elif k == "media_type" and "image" in str(v): result[k] = v # If we're in an image block, look for a sibling 'data' field with base64 content if ( "data" in result and isinstance(result["data"], str) and len(result["data"]) > 1000 ): result["data"] = f"[BASE64_DATA_LENGTH_{len(result['data'])}]" else: result[k] = self.sanitize_log_data(v) return result elif isinstance(data, list): return [self.sanitize_log_data(item) for item in data] elif isinstance(data, str) and len(data) > 1000 and "base64" in data.lower(): return f"[BASE64_DATA_LENGTH_{len(data)}]" else: return data def save_screenshot(self, img_base64: str, action_type: str = "") -> Optional[str]: """Save a screenshot to the experiment directory. 
Args: img_base64: Base64 encoded screenshot action_type: Type of action that triggered the screenshot Returns: Path to the saved screenshot or None if there was an error """ if not self.current_turn_dir: return None try: # Increment screenshot counter self.screenshot_count += 1 # Sanitize action_type to ensure valid filename # Replace characters that are not safe for filenames sanitized_action = "" if action_type: # Replace invalid filename characters with underscores sanitized_action = re.sub(r'[\\/*?:"<>|]', "_", action_type) # Limit the length to avoid excessively long filenames sanitized_action = sanitized_action[:50] # Create a descriptive filename timestamp = int(datetime.now().timestamp() * 1000) action_suffix = f"_{sanitized_action}" if sanitized_action else "" filename = f"screenshot_{self.screenshot_count:03d}{action_suffix}_{timestamp}.png" # Save directly to the turn directory filepath = os.path.join(self.current_turn_dir, filename) # Save the screenshot img_data = base64.b64decode(img_base64) with open(filepath, "wb") as f: f.write(img_data) # Keep track of the file path self.screenshot_paths.append(filepath) return filepath except Exception as e: logger.error(f"Error saving screenshot: {str(e)}") return None def save_action_visualization( self, img: Image.Image, action_name: str, details: str = "" ) -> str: """Save a visualization of an action. 
Args: img: Image to save action_name: Name of the action details: Additional details about the action Returns: Path to the saved image """ if not self.current_turn_dir: return "" try: # Create a descriptive filename timestamp = int(datetime.now().timestamp() * 1000) details_suffix = f"_{details}" if details else "" filename = f"vis_{action_name}{details_suffix}_{timestamp}.png" # Save directly to the turn directory filepath = os.path.join(self.current_turn_dir, filename) # Save the image img.save(filepath) # Keep track of the file path self.screenshot_paths.append(filepath) return filepath except Exception as e: logger.error(f"Error saving action visualization: {str(e)}") return "" def log_api_call( self, call_type: str, request: Any, provider: str = "unknown", model: str = "unknown", response: Any = None, error: Optional[Exception] = None, ) -> None: """Log API call details to file. Args: call_type: Type of API call (request, response, error) request: Request data provider: API provider name model: Model name response: Response data (for response logs) error: Error information (for error logs) """ if not self.current_turn_dir: logger.warning("Cannot log API call: current_turn_dir not set") return try: # Create a timestamp for the log file timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Create filename based on log type filename = f"api_call_{timestamp}_{call_type}.json" filepath = os.path.join(self.current_turn_dir, filename) # Sanitize data before logging sanitized_request = self.sanitize_log_data(request) sanitized_response = self.sanitize_log_data(response) if response is not None else None # Prepare log data log_data = { "timestamp": timestamp, "provider": provider, "model": model, "type": call_type, "request": sanitized_request, } if sanitized_response is not None: log_data["response"] = sanitized_response if error is not None: log_data["error"] = str(error) # Write to file with open(filepath, "w") as f: json.dump(log_data, f, indent=2, default=str) 
logger.info(f"Logged API {call_type} to {filepath}") except Exception as e: logger.error(f"Error logging API call: {str(e)}") ``` ## /libs/agent/agent/core/factory.py ```py path="/libs/agent/agent/core/factory.py" """Base agent loop implementation.""" import logging import importlib.util from typing import Dict, Optional, Type, TYPE_CHECKING, Any, cast, Callable, Awaitable from computer import Computer from .types import AgentLoop from .base import BaseLoop logger = logging.getLogger(__name__) class LoopFactory: """Factory class for creating agent loops.""" # Registry to store loop implementations _loop_registry: Dict[AgentLoop, Type[BaseLoop]] = {} @classmethod def create_loop( cls, loop_type: AgentLoop, api_key: str, model_name: str, computer: Computer, provider: Any = None, save_trajectory: bool = True, trajectory_dir: str = "trajectories", only_n_most_recent_images: Optional[int] = None, acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None, provider_base_url: Optional[str] = None, ) -> BaseLoop: """Create and return an appropriate loop instance based on type.""" if loop_type == AgentLoop.ANTHROPIC: # Lazy import AnthropicLoop only when needed try: from ..providers.anthropic.loop import AnthropicLoop except ImportError: raise ImportError( "The 'anthropic' provider is not installed. " "Install it with 'pip install cua-agent[anthropic]'" ) return AnthropicLoop( api_key=api_key, model=model_name, computer=computer, save_trajectory=save_trajectory, base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, ) elif loop_type == AgentLoop.OPENAI: # Lazy import OpenAILoop only when needed try: from ..providers.openai.loop import OpenAILoop except ImportError: raise ImportError( "The 'openai' provider is not installed. 
" "Install it with 'pip install cua-agent[openai]'" ) return OpenAILoop( api_key=api_key, model=model_name, computer=computer, save_trajectory=save_trajectory, base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, acknowledge_safety_check_callback=acknowledge_safety_check_callback, ) elif loop_type == AgentLoop.OMNI: # Lazy import OmniLoop and related classes only when needed try: from ..providers.omni.loop import OmniLoop from ..providers.omni.parser import OmniParser from .types import LLMProvider except ImportError: raise ImportError( "The 'omni' provider is not installed. " "Install it with 'pip install cua-agent[all]'" ) if provider is None: raise ValueError("Provider is required for OMNI loop type") # We know provider is the correct type at this point, so cast it provider_instance = cast(LLMProvider, provider) return OmniLoop( provider=provider_instance, api_key=api_key, model=model_name, computer=computer, save_trajectory=save_trajectory, base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, parser=OmniParser(), provider_base_url=provider_base_url, ) elif loop_type == AgentLoop.UITARS: # Lazy import UITARSLoop only when needed try: from ..providers.uitars.loop import UITARSLoop except ImportError: raise ImportError( "The 'uitars' provider is not installed. 
" "Install it with 'pip install cua-agent[all]'" ) return UITARSLoop( api_key=api_key, model=model_name, computer=computer, save_trajectory=save_trajectory, base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, provider_base_url=provider_base_url, ) else: raise ValueError(f"Unsupported loop type: {loop_type}") ``` ## /libs/agent/agent/core/messages.py ```py path="/libs/agent/agent/core/messages.py" """Message handling utilities for agent.""" import logging import json from typing import Any, Dict, List, Optional, Union, Tuple from dataclasses import dataclass import re from ..providers.omni.parser import ParseResult logger = logging.getLogger(__name__) @dataclass class ImageRetentionConfig: """Configuration for image retention in messages.""" num_images_to_keep: Optional[int] = None min_removal_threshold: int = 1 enable_caching: bool = True def should_retain_images(self) -> bool: """Check if image retention is enabled.""" return self.num_images_to_keep is not None and self.num_images_to_keep > 0 class BaseMessageManager: """Base class for message preparation and management.""" def __init__(self, image_retention_config: Optional[ImageRetentionConfig] = None): """Initialize the message manager. Args: image_retention_config: Configuration for image retention """ self.image_retention_config = image_retention_config or ImageRetentionConfig() if self.image_retention_config.min_removal_threshold < 1: raise ValueError("min_removal_threshold must be at least 1") # Track provider for message formatting self.provider = "openai" # Default provider def set_provider(self, provider: str) -> None: """Set the current provider to format messages for. Args: provider: Provider name (e.g., 'openai', 'anthropic') """ self.provider = provider.lower() def prepare_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Prepare messages by applying image retention and caching as configured. 
Args: messages: List of messages to prepare Returns: Prepared messages """ if self.image_retention_config.should_retain_images(): self._filter_images(messages) if self.image_retention_config.enable_caching: self._inject_caching(messages) return messages def _filter_images(self, messages: List[Dict[str, Any]]) -> None: """Filter messages to retain only the specified number of most recent images. Args: messages: Messages to filter """ # Find all tool result blocks that contain images tool_results = [ item for message in messages for item in (message["content"] if isinstance(message["content"], list) else []) if isinstance(item, dict) and item.get("type") == "tool_result" ] # Count total images total_images = sum( 1 for result in tool_results for content in result.get("content", []) if isinstance(content, dict) and content.get("type") == "image" ) # Calculate how many images to remove images_to_remove = total_images - (self.image_retention_config.num_images_to_keep or 0) images_to_remove -= images_to_remove % self.image_retention_config.min_removal_threshold # Remove oldest images first for result in tool_results: if isinstance(result.get("content"), list): new_content = [] for content in result["content"]: if isinstance(content, dict) and content.get("type") == "image": if images_to_remove > 0: images_to_remove -= 1 continue new_content.append(content) result["content"] = new_content def _inject_caching(self, messages: List[Dict[str, Any]]) -> None: """Inject caching control for recent message turns. 
Args: messages: Messages to inject caching into """ # Only apply cache_control for Anthropic API, not OpenAI if self.provider != "anthropic": return # Default to caching last 3 turns turns_to_cache = 3 for message in reversed(messages): if message["role"] == "user" and isinstance(content := message["content"], list): if turns_to_cache: turns_to_cache -= 1 content[-1]["cache_control"] = {"type": "ephemeral"} else: content[-1].pop("cache_control", None) break class StandardMessageManager: """Manages messages in a standardized OpenAI format across different providers.""" def __init__(self, config: Optional[ImageRetentionConfig] = None): """Initialize message manager. Args: config: Configuration for image retention """ self.messages: List[Dict[str, Any]] = [] self.config = config or ImageRetentionConfig() def add_user_message(self, content: Union[str, List[Dict[str, Any]]]) -> None: """Add a user message. Args: content: Message content (text or multimodal content) """ self.messages.append({"role": "user", "content": content}) def add_assistant_message(self, content: Union[str, List[Dict[str, Any]]]) -> None: """Add an assistant message. Args: content: Message content (text or multimodal content) """ self.messages.append({"role": "assistant", "content": content}) def add_system_message(self, content: str) -> None: """Add a system message. Args: content: System message content """ self.messages.append({"role": "system", "content": content}) def get_messages(self) -> List[Dict[str, Any]]: """Get all messages in standard format. Returns: List of messages """ # If image retention is configured, apply it if self.config.num_images_to_keep is not None: return self._apply_image_retention(self.messages) return self.messages def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Apply image retention policy to messages. 
Args: messages: List of messages Returns: List of messages with image retention applied """ if not self.config.num_images_to_keep: return messages # Find user messages with images image_messages = [] for msg in messages: if msg["role"] == "user" and isinstance(msg["content"], list): has_image = any( item.get("type") == "image_url" or item.get("type") == "image" for item in msg["content"] ) if has_image: image_messages.append(msg) # If we don't have more images than the limit, return all messages if len(image_messages) <= self.config.num_images_to_keep: return messages # Get the most recent N images to keep images_to_keep = image_messages[-self.config.num_images_to_keep :] images_to_remove = image_messages[: -self.config.num_images_to_keep] # Create a new message list without the older images result = [] for msg in messages: if msg in images_to_remove: # Skip this message continue result.append(msg) return result def to_anthropic_format( self, messages: List[Dict[str, Any]] ) -> Tuple[List[Dict[str, Any]], str]: """Convert standard OpenAI format messages to Anthropic format. 
        Args:
            messages: List of messages in OpenAI format

        Returns:
            Tuple containing (anthropic_messages, system_content)
        """
        result = []
        system_content = ""

        # Process messages in order to maintain conversation flow
        previous_assistant_tool_use_ids = (
            set()
        )  # Track tool_use_ids in the previous assistant message

        for i, msg in enumerate(messages):
            role = msg.get("role", "")
            content = msg.get("content", "")

            if role == "system":
                # Collect system messages for later use (Anthropic takes the
                # system prompt as a separate parameter, not as a message).
                system_content += content + "\n"
                continue

            if role == "assistant":
                # Track tool_use_ids in this assistant message for the next user message
                previous_assistant_tool_use_ids = set()
                if isinstance(content, list):
                    for item in content:
                        if (
                            isinstance(item, dict)
                            and item.get("type") == "tool_use"
                            and "id" in item
                        ):
                            previous_assistant_tool_use_ids.add(item["id"])
                logger.info(
                    f"Tool use IDs in assistant message #{i}: {previous_assistant_tool_use_ids}"
                )

            if role in ["user", "assistant"]:
                anthropic_msg = {"role": role}

                # Convert content based on type
                if isinstance(content, str):
                    # Simple text content
                    anthropic_msg["content"] = [{"type": "text", "text": content}]
                elif isinstance(content, list):
                    # Convert complex content
                    anthropic_content = []
                    for item in content:
                        item_type = item.get("type", "")

                        if item_type == "text":
                            anthropic_content.append({"type": "text", "text": item.get("text", "")})
                        elif item_type == "image_url":
                            # Convert OpenAI image format to Anthropic
                            image_url = item.get("image_url", {}).get("url", "")
                            if image_url.startswith("data:"):
                                # Extract base64 data and media type
                                match = re.match(r"data:(.+);base64,(.+)", image_url)
                                if match:
                                    media_type, data = match.groups()
                                    anthropic_content.append(
                                        {
                                            "type": "image",
                                            "source": {
                                                "type": "base64",
                                                "media_type": media_type,
                                                "data": data,
                                            },
                                        }
                                    )
                                # NOTE(review): a data: URL that does not match the
                                # regex is silently dropped — confirm intended.
                            else:
                                # Regular URL
                                anthropic_content.append(
                                    {
                                        "type": "image",
                                        "source": {
                                            "type": "url",
                                            "url": image_url,
                                        },
                                    }
                                )
                        elif item_type == "tool_use":
                            # Always include tool_use blocks
                            anthropic_content.append(item)
                        elif item_type == "tool_result":
                            # Check if this is a user message AND if the tool_use_id exists in the previous assistant message
                            tool_use_id = item.get("tool_use_id")

                            # Only include tool_result if it references a tool_use from the immediately preceding assistant message
                            if (
                                role == "user"
                                and tool_use_id
                                and tool_use_id in previous_assistant_tool_use_ids
                            ):
                                anthropic_content.append(item)
                                logger.info(
                                    f"Including tool_result with tool_use_id: {tool_use_id}"
                                )
                            else:
                                # Convert to text to preserve information
                                # (an orphaned tool_result would be rejected by
                                # the Anthropic API).
                                logger.warning(
                                    f"Converting tool_result to text. Tool use ID {tool_use_id} not found in previous assistant message"
                                )
                                content_text = "Tool Result: "
                                if "content" in item:
                                    if isinstance(item["content"], list):
                                        for content_item in item["content"]:
                                            if (
                                                isinstance(content_item, dict)
                                                and content_item.get("type") == "text"
                                            ):
                                                content_text += content_item.get("text", "")
                                    elif isinstance(item["content"], str):
                                        content_text += item["content"]
                                anthropic_content.append({"type": "text", "text": content_text})

                    anthropic_msg["content"] = anthropic_content

                result.append(anthropic_msg)

        return result, system_content

    def from_anthropic_format(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert Anthropic format messages to standard OpenAI format.
        Args:
            messages: List of messages in Anthropic format

        Returns:
            List of messages in OpenAI format
        """
        result = []

        for msg in messages:
            role = msg.get("role", "")
            # NOTE(review): assumes Anthropic content is always a list of
            # blocks — confirm no callers pass plain-string content.
            content = msg.get("content", [])

            if role in ["user", "assistant"]:
                openai_msg = {"role": role}

                # Simple case: single text block
                if len(content) == 1 and content[0].get("type") == "text":
                    openai_msg["content"] = content[0].get("text", "")
                else:
                    # Complex case: multiple blocks or non-text
                    openai_content = []
                    for item in content:
                        item_type = item.get("type", "")

                        if item_type == "text":
                            openai_content.append({"type": "text", "text": item.get("text", "")})
                        elif item_type == "image":
                            # Convert Anthropic image to OpenAI format
                            source = item.get("source", {})
                            if source.get("type") == "base64":
                                media_type = source.get("media_type", "image/png")
                                data = source.get("data", "")
                                openai_content.append(
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": f"data:{media_type};base64,{data}"},
                                    }
                                )
                            else:
                                # URL
                                openai_content.append(
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": source.get("url", "")},
                                    }
                                )
                        elif item_type in ["tool_use", "tool_result"]:
                            # Pass through tool-related content
                            openai_content.append(item)

                    openai_msg["content"] = openai_content

                result.append(openai_msg)

        return result
```

## /libs/agent/agent/core/provider_config.py

```py path="/libs/agent/agent/core/provider_config.py"
"""Provider-specific configurations and constants."""

from .types import LLMProvider

# Default models for different providers
DEFAULT_MODELS = {
    LLMProvider.OPENAI: "gpt-4o",
    LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219",
    LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M",
    LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct",
}

# Map providers to their environment variable names
ENV_VARS = {
    LLMProvider.OPENAI: "OPENAI_API_KEY",
    LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
    LLMProvider.OLLAMA: "none",
    LLMProvider.OAICOMPAT: "none",  # OpenAI-compatible API typically doesn't require an API key
}
```

## /libs/agent/agent/core/telemetry.py

```py
path="/libs/agent/agent/core/telemetry.py"
"""Agent telemetry for tracking anonymous usage and feature usage."""

import logging
import os
import platform
import sys
from typing import Dict, Any, Callable

# Import the core telemetry module
TELEMETRY_AVAILABLE = False


# Local fallbacks in case core telemetry isn't available
def _noop(*args: Any, **kwargs: Any) -> None:
    """No-op function for when telemetry is not available."""
    pass


# Define default functions with unique names to avoid shadowing
_default_record_event = _noop
_default_increment_counter = _noop
_default_set_dimension = _noop
_default_get_telemetry_client = lambda: None
_default_flush = _noop
_default_is_telemetry_enabled = lambda: False
_default_is_telemetry_globally_disabled = lambda: True

# Set the actual functions to the defaults initially
record_event = _default_record_event
increment_counter = _default_increment_counter
set_dimension = _default_set_dimension
get_telemetry_client = _default_get_telemetry_client
flush = _default_flush
is_telemetry_enabled = _default_is_telemetry_enabled
is_telemetry_globally_disabled = _default_is_telemetry_globally_disabled

logger = logging.getLogger("cua.agent.telemetry")

try:
    # Import from core telemetry
    from core.telemetry import (
        record_event as core_record_event,
        increment as core_increment,
        get_telemetry_client as core_get_telemetry_client,
        flush as core_flush,
        is_telemetry_enabled as core_is_telemetry_enabled,
        is_telemetry_globally_disabled as core_is_telemetry_globally_disabled,
    )

    # Override the default functions with actual implementations
    record_event = core_record_event
    get_telemetry_client = core_get_telemetry_client
    flush = core_flush
    is_telemetry_enabled = core_is_telemetry_enabled
    is_telemetry_globally_disabled = core_is_telemetry_globally_disabled

    def increment_counter(counter_name: str, value: int = 1) -> None:
        """Wrapper for increment to maintain backward compatibility."""
        if is_telemetry_enabled():
            core_increment(counter_name, value)

    def set_dimension(name: str, value: Any) -> None:
        """Set a dimension that will be attached to all events."""
        # NOTE(review): this only logs; nothing is forwarded to the telemetry
        # backend here — confirm that is intended.
        logger.debug(f"Setting dimension {name}={value}")

    TELEMETRY_AVAILABLE = True
    logger.info("Successfully imported telemetry")
except ImportError as e:
    logger.warning(f"Could not import telemetry: {e}")
    logger.debug("Telemetry not available, using no-op functions")

# Get system info once to use in telemetry
SYSTEM_INFO = {
    "os": platform.system().lower(),
    "os_version": platform.release(),
    "python_version": platform.python_version(),
}


def enable_telemetry() -> bool:
    """Enable telemetry if available.

    Returns:
        bool: True if telemetry was successfully enabled, False otherwise
    """
    global TELEMETRY_AVAILABLE, record_event, increment_counter, get_telemetry_client, flush, is_telemetry_enabled, is_telemetry_globally_disabled

    # Check if globally disabled using core function
    if TELEMETRY_AVAILABLE and is_telemetry_globally_disabled():
        logger.info("Telemetry is globally disabled via environment variable - cannot enable")
        return False

    # Already enabled
    if TELEMETRY_AVAILABLE:
        return True

    # Try to import and enable
    try:
        # NOTE(review): this imports `increment` (not `increment_counter`), so
        # the global `increment_counter` is NOT rebound on this late-enable
        # path and stays a no-op — confirm whether that is intended.
        from core.telemetry import (
            record_event,
            increment,
            get_telemetry_client,
            flush,
            is_telemetry_globally_disabled,
        )

        # Check again after import
        if is_telemetry_globally_disabled():
            logger.info("Telemetry is globally disabled via environment variable - cannot enable")
            return False

        TELEMETRY_AVAILABLE = True
        logger.info("Telemetry successfully enabled")
        return True
    except ImportError as e:
        logger.warning(f"Could not enable telemetry: {e}")
        return False


# NOTE(review): this module-level def shadows the `is_telemetry_enabled`
# alias imported at the top of the file.
def is_telemetry_enabled() -> bool:
    """Check if telemetry is enabled.

    Returns:
        bool: True if telemetry is enabled, False otherwise
    """
    # Use the core function if available, otherwise use our local flag
    if TELEMETRY_AVAILABLE:
        from core.telemetry import is_telemetry_enabled as core_is_enabled

        return core_is_enabled()
    return False


def record_agent_initialization() -> None:
    """Record when an agent instance is initialized."""
    if TELEMETRY_AVAILABLE and is_telemetry_enabled():
        record_event("agent_initialized", SYSTEM_INFO)

        # Set dimensions that will be attached to all events
        set_dimension("os", SYSTEM_INFO["os"])
        set_dimension("os_version", SYSTEM_INFO["os_version"])
        set_dimension("python_version", SYSTEM_INFO["python_version"])
```

## /libs/agent/agent/core/tools.py

```py path="/libs/agent/agent/core/tools.py"
"""Tool-related type definitions."""

from enum import Enum
from typing import Dict, Any, Optional
from pydantic import BaseModel, ConfigDict


class ToolInvocationState(str, Enum):
    """States for tool invocation."""

    CALL = 'call'
    PARTIAL_CALL = 'partial-call'
    RESULT = 'result'


class ToolInvocation(BaseModel):
    """Tool invocation type."""

    model_config = ConfigDict(extra='forbid')
    state: Optional[str] = None
    toolCallId: str
    toolName: Optional[str] = None
    args: Optional[Dict[str, Any]] = None


class ClientAttachment(BaseModel):
    """Client attachment type."""

    name: str
    contentType: str
    url: str


class ToolResult(BaseModel):
    """Result of a tool execution."""

    model_config = ConfigDict(extra='forbid')
    output: Optional[str] = None
    error: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
```

## /libs/agent/agent/core/tools/__init__.py

```py path="/libs/agent/agent/core/tools/__init__.py"
"""Core tools package."""

from .base import BaseTool, ToolResult, ToolError, ToolFailure, CLIResult
from .bash import BaseBashTool
from .collection import ToolCollection
from .computer import BaseComputerTool
from .edit import BaseEditTool
from .manager import BaseToolManager

__all__ = [
    "BaseTool",
    "ToolResult",
    "ToolError",
    "ToolFailure",
"CLIResult", "BaseBashTool", "BaseComputerTool", "BaseEditTool", "ToolCollection", "BaseToolManager", ] ``` ## /libs/agent/agent/core/tools/base.py ```py path="/libs/agent/agent/core/tools/base.py" """Abstract base classes for tools that can be used with any provider.""" from abc import ABCMeta, abstractmethod from dataclasses import dataclass, fields, replace from typing import Any, Dict class BaseTool(metaclass=ABCMeta): """Abstract base class for provider-agnostic tools.""" name: str @abstractmethod async def __call__(self, **kwargs) -> Any: """Executes the tool with the given arguments.""" ... @abstractmethod def to_params(self) -> Dict[str, Any]: """Convert tool to provider-specific API parameters. Returns: Dictionary with tool parameters specific to the LLM provider """ raise NotImplementedError @dataclass(kw_only=True, frozen=True) class ToolResult: """Represents the result of a tool execution.""" output: str | None = None error: str | None = None base64_image: str | None = None system: str | None = None content: list[dict] | None = None def __bool__(self): return any(getattr(self, field.name) for field in fields(self)) def __add__(self, other: "ToolResult"): def combine_fields(field: str | None, other_field: str | None, concatenate: bool = True): if field and other_field: if concatenate: return field + other_field raise ValueError("Cannot combine tool results") return field or other_field return ToolResult( output=combine_fields(self.output, other.output), error=combine_fields(self.error, other.error), base64_image=combine_fields(self.base64_image, other.base64_image, False), system=combine_fields(self.system, other.system), content=self.content or other.content, # Use first non-None content ) def replace(self, **kwargs): """Returns a new ToolResult with the given fields replaced.""" return replace(self, **kwargs) class CLIResult(ToolResult): """A ToolResult that can be rendered as a CLI output.""" class ToolFailure(ToolResult): """A ToolResult that 
represents a failure.""" class ToolError(Exception): """Raised when a tool encounters an error.""" def __init__(self, message): self.message = message ``` ## /libs/agent/agent/core/tools/bash.py ```py path="/libs/agent/agent/core/tools/bash.py" """Abstract base bash/shell tool implementation.""" import asyncio import logging from abc import abstractmethod from typing import Any, Dict, Tuple from computer.computer import Computer from .base import BaseTool, ToolResult class BaseBashTool(BaseTool): """Base class for bash/shell command execution tools across different providers.""" name = "bash" logger = logging.getLogger(__name__) computer: Computer def __init__(self, computer: Computer): """Initialize the BashTool. Args: computer: Computer instance, may be used for related operations """ self.computer = computer async def run_command(self, command: str) -> Tuple[int, str, str]: """Run a shell command and return exit code, stdout, and stderr. Args: command: Shell command to execute Returns: Tuple containing (exit_code, stdout, stderr) """ try: process = await asyncio.create_subprocess_shell( command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await process.communicate() return process.returncode or 0, stdout.decode(), stderr.decode() except Exception as e: self.logger.error(f"Error running command: {str(e)}") return 1, "", str(e) @abstractmethod async def __call__(self, **kwargs) -> ToolResult: """Execute the tool with the provided arguments.""" raise NotImplementedError ``` ## /libs/agent/agent/core/tools/collection.py ```py path="/libs/agent/agent/core/tools/collection.py" """Collection classes for managing multiple tools.""" from typing import Any, Dict, List, Type from .base import ( BaseTool, ToolError, ToolFailure, ToolResult, ) class ToolCollection: """A collection of tools that can be used with any provider.""" def __init__(self, *tools: BaseTool): self.tools = tools self.tool_map = {tool.name: tool for tool in tools} 
    def to_params(self) -> List[Dict[str, Any]]:
        """Convert all tools to provider-specific parameters.

        Returns:
            List of dictionaries with tool parameters
        """
        return [tool.to_params() for tool in self.tools]

    async def run(self, *, name: str, tool_input: Dict[str, Any]) -> ToolResult:
        """Run a tool with the given input.

        Args:
            name: Name of the tool to run
            tool_input: Input parameters for the tool

        Returns:
            Result of the tool execution; failures are returned as ToolFailure
            rather than raised.
        """
        tool = self.tool_map.get(name)
        if not tool:
            return ToolFailure(error=f"Tool {name} is invalid")
        try:
            return await tool(**tool_input)
        except ToolError as e:
            return ToolFailure(error=e.message)
        except Exception as e:
            return ToolFailure(error=f"Unexpected error in tool {name}: {str(e)}")
```

## /libs/agent/agent/core/tools/computer.py

```py path="/libs/agent/agent/core/tools/computer.py"
"""Abstract base computer tool implementation."""

import asyncio
import base64
import io
import logging
from abc import abstractmethod
from typing import Any, Dict, Optional, Tuple

from PIL import Image

from computer.computer import Computer

from .base import BaseTool, ToolError, ToolResult


class BaseComputerTool(BaseTool):
    """Base class for computer interaction tools across different providers."""

    name = "computer"
    logger = logging.getLogger(__name__)

    # Screen dimensions; populated by initialize_dimensions().
    width: Optional[int] = None
    height: Optional[int] = None
    display_num: Optional[int] = None

    computer: Computer

    _screenshot_delay = 1.0  # Default delay for most platforms
    _scaling_enabled = True

    def __init__(self, computer: Computer):
        """Initialize the ComputerTool.

        Args:
            computer: Computer instance for screen interactions
        """
        self.computer = computer

    async def initialize_dimensions(self):
        """Initialize screen dimensions from the computer interface."""
        display_size = await self.computer.interface.get_screen_size()
        self.width = display_size["width"]
        self.height = display_size["height"]
        self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")

    @property
    def options(self) -> Dict[str, Any]:
        """Get the options for the tool.

        Returns:
            Dictionary with tool options

        Raises:
            RuntimeError: If initialize_dimensions() has not been called.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "display_width_px": self.width,
            "display_height_px": self.height,
            "display_number": self.display_num,
        }

    async def resize_screenshot_if_needed(self, screenshot: bytes) -> bytes:
        """Resize a screenshot to match the expected dimensions.

        Args:
            screenshot: Raw screenshot data

        Returns:
            Resized screenshot data

        Raises:
            ToolError: If dimensions are uninitialized or resizing fails.
        """
        if self.width is None or self.height is None:
            raise ToolError("Screen dimensions not initialized")
        try:
            img = Image.open(io.BytesIO(screenshot))
            # Flatten alpha/palette images to RGB before re-encoding.
            if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
                img = img.convert("RGB")
            # Resize if dimensions don't match
            if img.size != (self.width, self.height):
                self.logger.info(
                    f"Scaling image from {img.size} to {self.width}x{self.height} to match screen dimensions"
                )
                img = img.resize((self.width, self.height), Image.Resampling.LANCZOS)
                # Save back to bytes
                buffer = io.BytesIO()
                img.save(buffer, format="PNG")
                return buffer.getvalue()
            # NOTE(review): when sizes already match, the original bytes are
            # returned and any RGB conversion above is discarded — confirm
            # that is intended.
            return screenshot
        except Exception as e:
            self.logger.error(f"Error during screenshot resizing: {str(e)}")
            raise ToolError(f"Failed to resize screenshot: {str(e)}")

    async def screenshot(self) -> ToolResult:
        """Take a screenshot and return it as a ToolResult with base64-encoded image.

        Returns:
            ToolResult with the screenshot, or an error ToolResult on failure.
        """
        try:
            screenshot = await self.computer.interface.screenshot()
            screenshot = await self.resize_screenshot_if_needed(screenshot)
            return ToolResult(base64_image=base64.b64encode(screenshot).decode())
        except Exception as e:
            self.logger.error(f"Error taking screenshot: {str(e)}")
            return ToolResult(error=f"Failed to take screenshot: {str(e)}")

    @abstractmethod
    async def __call__(self, **kwargs) -> ToolResult:
        """Execute the tool with the provided arguments."""
        raise NotImplementedError
```

## /libs/agent/agent/core/tools/edit.py

```py path="/libs/agent/agent/core/tools/edit.py"
"""Abstract base edit tool implementation."""

import asyncio
import logging
import os
from abc import abstractmethod
from pathlib import Path
from typing import Any, Dict, Optional

from computer.computer import Computer

from .base import BaseTool, ToolError, ToolResult


class BaseEditTool(BaseTool):
    """Base class for text editor tools across different providers."""

    name = "edit"
    logger = logging.getLogger(__name__)
    computer: Computer

    def __init__(self, computer: Computer):
        """Initialize the EditTool.

        Args:
            computer: Computer instance, may be used for related operations
        """
        self.computer = computer

    async def read_file(self, path: str) -> str:
        """Read a file and return its contents.

        Args:
            path: Path to the file to read

        Returns:
            File contents as a string

        Raises:
            ToolError: If the file does not exist or cannot be read.
        """
        try:
            path_obj = Path(path)
            if not path_obj.exists():
                raise ToolError(f"File does not exist: {path}")
            return path_obj.read_text()
        # NOTE(review): this also catches the ToolError raised just above and
        # re-wraps it in a second ToolError — confirm intended.
        except Exception as e:
            self.logger.error(f"Error reading file: {str(e)}")
            raise ToolError(f"Failed to read file: {str(e)}")

    async def write_file(self, path: str, content: str) -> None:
        """Write content to a file.
        Args:
            path: Path to the file to write
            content: Content to write to the file

        Raises:
            ToolError: If the file cannot be written.
        """
        try:
            path_obj = Path(path)
            # Create parent directories if they don't exist
            path_obj.parent.mkdir(parents=True, exist_ok=True)
            path_obj.write_text(content)
        except Exception as e:
            self.logger.error(f"Error writing file: {str(e)}")
            raise ToolError(f"Failed to write file: {str(e)}")

    @abstractmethod
    async def __call__(self, **kwargs) -> ToolResult:
        """Execute the tool with the provided arguments."""
        raise NotImplementedError
```

## /libs/agent/agent/core/tools/manager.py

```py path="/libs/agent/agent/core/tools/manager.py"
"""Tool manager for initializing and running tools."""

from abc import ABC, abstractmethod
from typing import Any, Dict, List

from computer.computer import Computer

from .base import BaseTool, ToolResult
from .collection import ToolCollection


class BaseToolManager(ABC):
    """Base class for tool managers across different providers."""

    def __init__(self, computer: Computer):
        """Initialize the tool manager.

        Args:
            computer: Computer instance for computer-related tools
        """
        self.computer = computer
        # Populated by initialize(); None until then.
        self.tools: ToolCollection | None = None

    @abstractmethod
    def _initialize_tools(self) -> ToolCollection:
        """Initialize all available tools."""
        ...

    async def initialize(self) -> None:
        """Initialize tool-specific requirements and create tool collection."""
        # Provider-specific setup must run before the collection is built.
        await self._initialize_tools_specific()
        self.tools = self._initialize_tools()

    @abstractmethod
    async def _initialize_tools_specific(self) -> None:
        """Initialize provider-specific tool requirements."""
        ...

    @abstractmethod
    def get_tool_params(self) -> List[Dict[str, Any]]:
        """Get tool parameters for API calls."""
        ...

    async def execute_tool(self, name: str, tool_input: Dict[str, Any]) -> ToolResult:
        """Execute a tool with the given input.

        Args:
            name: Name of the tool to execute
            tool_input: Input parameters for the tool

        Returns:
            Result of the tool execution

        Raises:
            RuntimeError: If initialize() has not been called yet.
        """
        if self.tools is None:
            raise RuntimeError("Tools not initialized. Call initialize() first.")
        return await self.tools.run(name=name, tool_input=tool_input)
```

## /libs/agent/agent/core/types.py

```py path="/libs/agent/agent/core/types.py"
"""Core type definitions."""

from typing import Any, Dict, List, Optional, TypedDict, Union
from enum import Enum, StrEnum, auto
from dataclasses import dataclass


class AgentLoop(Enum):
    """Enumeration of available loop types."""

    ANTHROPIC = auto()  # Anthropic implementation
    OMNI = auto()  # OmniLoop implementation
    OPENAI = auto()  # OpenAI implementation
    OLLAMA = auto()  # OLLAMA implementation
    UITARS = auto()  # UI-TARS implementation
    # Add more loop types as needed


class LLMProvider(StrEnum):
    """Supported LLM providers."""

    ANTHROPIC = "anthropic"
    OPENAI = "openai"
    OLLAMA = "ollama"
    OAICOMPAT = "oaicompat"


@dataclass
class LLM:
    """Configuration for LLM model and provider."""

    provider: LLMProvider
    name: Optional[str] = None
    provider_base_url: Optional[str] = None

    def __post_init__(self):
        """Set default model name if not provided."""
        if self.name is None:
            # Imported lazily to avoid a circular import with provider_config.
            from .provider_config import DEFAULT_MODELS

            self.name = DEFAULT_MODELS.get(self.provider)

        # Set default provider URL if none provided
        if self.provider_base_url is None and self.provider == LLMProvider.OAICOMPAT:
            # Default for vLLM
            self.provider_base_url = "http://localhost:8000/v1"
            # Common alternatives:
            # - LM Studio: "http://localhost:1234/v1"
            # - LocalAI: "http://localhost:8080/v1"
            # - Ollama with OpenAI compatible API: "http://localhost:11434/v1"


# For backward compatibility
LLMModel = LLM
Model = LLM


class AgentResponse(TypedDict, total=False):
    """Agent response format."""

    id: str
    object: str
    created_at: int
    status: str
    error: Optional[str]
    incomplete_details: Optional[Any]
    instructions: Optional[Any]
    max_output_tokens: Optional[int]
    model: str
    output: List[Dict[str, Any]]
    parallel_tool_calls: bool
    previous_response_id: Optional[str]
    reasoning: Dict[str, str]
    store: bool
    temperature: float
    text: Dict[str, Dict[str, str]]
    tool_choice: str
    tools:
List[Dict[str, Union[str, int]]] top_p: float truncation: str usage: Dict[str, Any] user: Optional[str] metadata: Dict[str, Any] response: Dict[str, List[Dict[str, Any]]] # Additional fields for error responses role: str content: Union[str, List[Dict[str, Any]]] ``` The content has been capped at 50000 tokens, and oversized files have been omitted. You can apply additional filters to refine the result. The better and more specific the context, the better the LLM can follow instructions. If the context seems verbose, refine the filter using uithub. Thank you for using https://uithub.com - Perfect LLM context for any GitHub repo.