diff --git a/.github/workflows/project-41-ci.yml b/.github/workflows/project-41-ci.yml new file mode 100644 index 00000000..cc5c2b9c --- /dev/null +++ b/.github/workflows/project-41-ci.yml @@ -0,0 +1,89 @@ +name: "[P41] CI" + +on: + push: + branches: [devops-project, master] + paths: + - "DevOps-Project-41/app/**" + - ".github/workflows/project-41-ci.yml" + pull_request: + branches: [master] + paths: + - "DevOps-Project-41/app/**" + - ".github/workflows/project-41-ci.yml" + +permissions: + contents: read + +env: + APP_DIR: DevOps-Project-41/app + +jobs: + build-test: + name: Build & Test + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: "8.0.x" + + - name: Restore + working-directory: ${{ env.APP_DIR }} + run: dotnet restore + + - name: Build + working-directory: ${{ env.APP_DIR }} + run: dotnet build -c Release --no-restore + + - name: Test + working-directory: ${{ env.APP_DIR }} + run: dotnet test -c Release --no-build --logger trx --results-directory TestResults + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results + path: ${{ env.APP_DIR }}/TestResults + + docker-build: + name: Docker Build (smoke test) + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build API image + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: api + push: false + load: true + tags: ai-api:ci + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Build Worker image + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: worker + push: false + load: true + tags: ai-worker:ci + cache-from: 
type=gha + cache-to: type=gha,mode=max diff --git a/.github/workflows/project-41-release.yml b/.github/workflows/project-41-release.yml new file mode 100644 index 00000000..18d15596 --- /dev/null +++ b/.github/workflows/project-41-release.yml @@ -0,0 +1,163 @@ +name: "[P41] Release" + +on: + push: + tags: + - "p41-v*.*.*" + workflow_dispatch: + inputs: + tag: + description: "Image tag (e.g. 1.0.0)" + required: true + default: "1.0.0" + +permissions: + contents: read + packages: write + id-token: write + security-events: write + +env: + REGISTRY: ghcr.io + IMAGE_OWNER: ${{ github.repository_owner }} + APP_DIR: DevOps-Project-41/app + +jobs: + build-push-sign: + name: Build, Push and Sign + runs-on: ubuntu-latest + outputs: + api-digest: ${{ steps.push-api.outputs.digest }} + worker-digest: ${{ steps.push-worker.outputs.digest }} + image-tag: ${{ steps.meta.outputs.version }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + + - name: Extract image metadata + id: meta # version = X.Y.Z captured from a p41-vX.Y.Z tag, or inputs.tag on manual dispatch; sha- is the fallback + uses: docker/metadata-action@v5 + with: + images: | + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker + tags: | + type=match,pattern=p41-v(\d+\.\d+\.\d+),group=1 + type=raw,value=${{ inputs.tag }},enable=${{ github.event_name == 'workflow_dispatch' }} + type=sha,prefix=sha- + + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: "8.0.x" + + - name: Run tests before release + working-directory: ${{ env.APP_DIR }} + run: dotnet test -c Release --logger trx + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push API image + id: push-api + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: api + push: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api:${{ 
steps.meta.outputs.version }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Build and push Worker image + id: push-worker + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: worker + push: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker:${{ steps.meta.outputs.version }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Install Cosign + uses: sigstore/cosign-installer@v3 + + - name: Sign API image (keyless) + run: | + cosign sign --yes \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + + - name: Sign Worker image (keyless) + run: | + cosign sign --yes \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} + + - name: Generate SBOM for API (SPDX) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + format: spdx-json + output: sbom-api.spdx.json + + - name: Generate SBOM for API (CycloneDX) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + format: cyclonedx + output: sbom-api.cyclonedx.json + + - name: Generate SBOM for Worker (SPDX) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} + format: spdx-json + output: sbom-worker.spdx.json + + - name: Attest SBOM for API + run: | + cosign attest --yes \ + --predicate sbom-api.spdx.json \ + --type spdxjson \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + + - name: Attest SBOM for Worker + run: | + cosign attest --yes \ + --predicate sbom-worker.spdx.json \ + --type spdxjson \ + ${{ env.REGISTRY 
}}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} + + - name: Upload SBOMs as artefacts + uses: actions/upload-artifact@v4 + with: + name: sbom-${{ steps.meta.outputs.version }} + path: "sbom-*.json" + + - name: Verify API signature + run: | + cosign verify \ + --certificate-identity-regexp "https://github.com/${{ github.repository }}/.github/workflows/project-41-release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + + - name: Verify Worker signature + run: | + cosign verify \ + --certificate-identity-regexp "https://github.com/${{ github.repository }}/.github/workflows/project-41-release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} diff --git a/.github/workflows/project-41-security.yml b/.github/workflows/project-41-security.yml new file mode 100644 index 00000000..f277263e --- /dev/null +++ b/.github/workflows/project-41-security.yml @@ -0,0 +1,109 @@ +name: "[P41] Security Scanning" + +on: + push: + branches: [devops-project, master] + paths: + - "DevOps-Project-41/app/**" + - "DevOps-Project-41/k8s/**" + - ".github/workflows/project-41-security.yml" + schedule: + - cron: "0 6 * * 1" # Weekly on Monday at 06:00 UTC + workflow_dispatch: + +permissions: + contents: read + security-events: write + +env: + APP_DIR: DevOps-Project-41/app + +jobs: + trivy-filesystem: + name: Trivy Filesystem Scan + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Trivy filesystem scan + uses: aquasecurity/trivy-action@0.24.0 + with: + scan-type: fs + scan-ref: DevOps-Project-41 + format: sarif + output: trivy-fs.sarif + severity: HIGH,CRITICAL + exit-code: "0" + + - name: Upload SARIF to GitHub Security + uses: github/codeql-action/upload-sarif@v3 + with: + 
sarif_file: trivy-fs.sarif + category: trivy-filesystem + + trivy-config: + name: Trivy Kubernetes Manifest Scan + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Trivy config scan on k8s manifests + uses: aquasecurity/trivy-action@0.24.0 + with: + scan-type: config + scan-ref: DevOps-Project-41/k8s + format: sarif + output: trivy-config.sarif + severity: HIGH,CRITICAL + exit-code: "0" + + - name: Upload SARIF to GitHub Security + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: trivy-config.sarif + category: trivy-config + + trivy-image: + name: Trivy Image Scan + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build API image for scanning + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: api + push: false + load: true + tags: ai-api:scan + cache-from: type=gha + + - name: Scan API image with Trivy + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ai-api:scan + format: sarif + output: trivy-image-api.sarif + severity: HIGH,CRITICAL + exit-code: "0" + + - name: Upload API image SARIF + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: trivy-image-api.sarif + category: trivy-image-api + + - name: Upload scan reports as artefacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: trivy-reports + path: "*.sarif" diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..94bf7519 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +# Build artifacts +bin/ +obj/ +out/ + +# IDE +.vs/ +.vscode/ +.idea/ +*.user +*.suo + +# .NET +TestResults/ +*.trx +*.coverage +*.coveragexml + +# Docker +*.env +.env.* +!.env.example + +# SBOM and scan reports +sbom-*.json +*.sarif + +# Terraform +*.tfstate +*.tfstate.* +.terraform/ +.terraform.lock.hcl +*.tfplan +override.tf +override.tf.json 
+*_override.tf +*_override.tf.json + +# Kubernetes secrets (never commit) +*secret*.yaml +!*secret*.example.yaml +!*secret*.template.yaml + +# OS +.DS_Store +Thumbs.db + +# Node (if any tooling added) +node_modules/ + +# Logs +*.log diff --git a/DevOps-Project-41/.github/workflows/ci.yml b/DevOps-Project-41/.github/workflows/ci.yml new file mode 100644 index 00000000..0000d4c9 --- /dev/null +++ b/DevOps-Project-41/.github/workflows/ci.yml @@ -0,0 +1,87 @@ +name: CI + +on: + push: + branches: [devops-project] + paths: + - "DevOps-Project-41/app/**" + - ".github/workflows/ci.yml" + pull_request: + branches: [devops-project, master] + paths: + - "DevOps-Project-41/app/**" + +permissions: + contents: read + +env: + DOTNET_VERSION: "8.0.x" + APP_DIR: DevOps-Project-41/app + +jobs: + build-and-test: + name: Build and Test + runs-on: ubuntu-latest + defaults: + run: + working-directory: ${{ env.APP_DIR }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: ${{ env.DOTNET_VERSION }} + + - name: Restore dependencies + run: dotnet restore + + - name: Build + run: dotnet build -c Release --no-restore + + - name: Run tests + run: dotnet test -c Release --no-build --no-restore --logger trx --results-directory TestResults + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results + path: ${{ env.APP_DIR }}/TestResults/*.trx + + docker-build: + name: Docker Build (validation) + runs-on: ubuntu-latest + needs: build-and-test + defaults: + run: + working-directory: ${{ env.APP_DIR }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build API image (no push) + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: api + push: false + tags: ai-api:ci + cache-from: type=gha + cache-to: 
type=gha,mode=max + + - name: Build Worker image (no push) + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: worker + push: false + tags: ai-worker:ci + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/DevOps-Project-41/.github/workflows/release.yml b/DevOps-Project-41/.github/workflows/release.yml new file mode 100644 index 00000000..9867065f --- /dev/null +++ b/DevOps-Project-41/.github/workflows/release.yml @@ -0,0 +1,161 @@ +name: Release + +on: + push: + tags: + - "v*.*.*" + workflow_dispatch: + inputs: + tag: + description: "Image tag (e.g. 1.0.0)" + required: true + default: "latest" + +permissions: + contents: read + packages: write + id-token: write + security-events: write + +env: + REGISTRY: ghcr.io + IMAGE_OWNER: ${{ github.repository_owner }} + APP_DIR: DevOps-Project-41/app + +jobs: + build-push-sign: + name: Build, Push and Sign + runs-on: ubuntu-latest + outputs: + api-digest: ${{ steps.push-api.outputs.digest }} + worker-digest: ${{ steps.push-worker.outputs.digest }} + image-tag: ${{ steps.meta.outputs.version }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Extract image metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: | + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha,prefix=sha- + + - name: Setup .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: "8.0.x" + + - name: Run tests before release + working-directory: ${{ env.APP_DIR }} + run: dotnet test -c Release --logger trx + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push API image + id: 
push-api + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: api + push: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api:${{ steps.meta.outputs.version }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Build and push Worker image + id: push-worker + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: worker + push: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker:${{ steps.meta.outputs.version }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Install Cosign + uses: sigstore/cosign-installer@v3 + + - name: Sign API image (keyless) + run: | + cosign sign --yes \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + + - name: Sign Worker image (keyless) + run: | + cosign sign --yes \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} + + - name: Generate SBOM for API (SPDX) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + format: spdx-json + output: sbom-api.spdx.json + + - name: Generate SBOM for API (CycloneDX) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + format: cyclonedx + output: sbom-api.cyclonedx.json + + - name: Generate SBOM for Worker (SPDX) + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} + format: spdx-json + output: sbom-worker.spdx.json + + - name: Attest SBOM for API + run: | + cosign attest --yes \ + --predicate sbom-api.spdx.json \ + --type spdxjson \ + ${{ env.REGISTRY }}/${{ 
env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + + - name: Attest SBOM for Worker + run: | + cosign attest --yes \ + --predicate sbom-worker.spdx.json \ + --type spdxjson \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} + + - name: Upload SBOMs as artefacts + uses: actions/upload-artifact@v4 + with: + name: sbom-${{ steps.meta.outputs.version }} + path: "sbom-*.json" + + - name: Verify API signature + run: | + cosign verify \ + --certificate-identity-regexp "https://github.com/${{ github.repository }}/.github/workflows/release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-api@${{ steps.push-api.outputs.digest }} + + - name: Verify Worker signature + run: | + cosign verify \ + --certificate-identity-regexp "https://github.com/${{ github.repository }}/.github/workflows/release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com \ + ${{ env.REGISTRY }}/${{ env.IMAGE_OWNER }}/ai-worker@${{ steps.push-worker.outputs.digest }} diff --git a/DevOps-Project-41/.github/workflows/security.yml b/DevOps-Project-41/.github/workflows/security.yml new file mode 100644 index 00000000..0a37f6b5 --- /dev/null +++ b/DevOps-Project-41/.github/workflows/security.yml @@ -0,0 +1,110 @@ +name: Security Scanning + +on: + push: + branches: [devops-project] + paths: + - "DevOps-Project-41/app/**" + - "DevOps-Project-41/k8s/**" + - ".github/workflows/security.yml" + schedule: + - cron: "0 6 * * 1" # Weekly on Monday at 06:00 UTC + workflow_dispatch: + +permissions: + contents: read + security-events: write + +env: + APP_DIR: DevOps-Project-41/app + +jobs: + trivy-filesystem: + name: Trivy Filesystem Scan + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Trivy filesystem scan + uses: aquasecurity/trivy-action@0.24.0 + with: + scan-type: fs + scan-ref: DevOps-Project-41 + 
format: sarif + output: trivy-fs.sarif + severity: HIGH,CRITICAL + exit-code: "0" + + - name: Upload SARIF to GitHub Security + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: trivy-fs.sarif + category: trivy-filesystem + + trivy-config: + name: Trivy Kubernetes Manifest Scan + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Trivy config scan on k8s manifests + uses: aquasecurity/trivy-action@0.24.0 + with: + scan-type: config + scan-ref: DevOps-Project-41/k8s + format: sarif + output: trivy-config.sarif + severity: HIGH,CRITICAL + exit-code: "0" + + - name: Upload SARIF to GitHub Security + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: trivy-config.sarif + category: trivy-config + + trivy-image: + name: Trivy Image Scan + runs-on: ubuntu-latest + needs: [] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build API image for scanning + uses: docker/build-push-action@v6 + with: + context: ${{ env.APP_DIR }} + file: ${{ env.APP_DIR }}/Dockerfile + target: api + push: false + load: true + tags: ai-api:scan + cache-from: type=gha + + - name: Scan API image with Trivy + uses: aquasecurity/trivy-action@0.24.0 + with: + image-ref: ai-api:scan + format: sarif + output: trivy-image-api.sarif + severity: HIGH,CRITICAL + exit-code: "0" + + - name: Upload API image SARIF + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: trivy-image-api.sarif + category: trivy-image-api + + - name: Upload scan reports as artefacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: trivy-reports + path: "*.sarif" diff --git a/DevOps-Project-41/.gitignore b/DevOps-Project-41/.gitignore new file mode 100644 index 00000000..ab3f3ee8 --- /dev/null +++ b/DevOps-Project-41/.gitignore @@ -0,0 +1,7 @@ +# Leading "/" anchors a pattern to this file's own directory (DevOps-Project-41/) +/app/.idea +/app/src/AiApi/.idea 
+/app/src/AiProvider/bin +/app/src/AiProvider/obj +/app/src/AiWorker/bin +/app/src/AiWorker/obj \ No newline at end of file diff --git a/DevOps-Project-41/README.md b/DevOps-Project-41/README.md new file mode 100644 index 00000000..34a04747 --- /dev/null +++ b/DevOps-Project-41/README.md @@ -0,0 +1,422 @@ +# DevOps-Project-41: AI-Native DevSecOps Platform + +> **End-to-end platform for deploying AI-ready applications with GitOps, event-driven autoscaling, full-stack observability, SBOM generation, image signing and DevSecOps security controls on Kubernetes.** + +--- + +## Overview + +This project demonstrates how to build and operate a production-grade AI workload delivery platform using modern DevOps and DevSecOps practices. It combines: + +- A **.NET 8 Minimal API** that accepts AI inference requests and queues them asynchronously +- A **.NET 8 Worker** that consumes jobs from Redis, calls a configurable AI provider (mock, Ollama, or OpenAI-compatible), and persists results in PostgreSQL +- **GitHub Actions** pipelines for CI, security scanning, SBOM generation, and Cosign image signing +- **Argo CD** for GitOps-based deployment +- **KEDA** for event-driven autoscaling based on Redis queue depth (scales to zero) +- **OpenTelemetry + Prometheus + Grafana + Loki** for traces, metrics and logs +- **Kyverno** admission policies for Kubernetes security governance +- **Trivy** for vulnerability and misconfiguration scanning + +The platform runs entirely locally using `kind` with no cloud account required. 
+ +--- + +## Architecture + +```mermaid +flowchart TD + DEV[Developer] -->|git push| GH[GitHub Repository] + + GH --> CI[GitHub Actions CI\nbuild + test + docker] + CI --> SEC[Security Workflow\nTrivy scan + SARIF] + CI --> REL[Release Workflow\nGHCR push + SBOM + Cosign sign] + REL --> GHCR[GitHub Container Registry] + REL -->|update image digest| GITOPS[GitOps Manifests\nk8s/overlays/dev] + + GITOPS --> ARGO[Argo CD\nautomated sync] + ARGO --> K8S[Kubernetes Cluster\nkind / EKS / AKS / GKE] + + subgraph K8S [Kubernetes — ai-devsecops namespace] + API[ai-api\n.NET 8 Minimal API\n:8080] + WORKER[ai-worker\n.NET 8 Worker] + REDIS[(Redis\nqueue)] + PG[(PostgreSQL\njob results)] + KEDA[KEDA\nScaledObject] + OTEL[OTel Collector\ntraces + metrics] + PROM[Prometheus] + GRAF[Grafana\ndashboard] + LOKI[Loki\nlogs] + end + + API -->|enqueue job| REDIS + API -->|insert row| PG + KEDA -->|scale based on queue depth| WORKER + WORKER -->|dequeue job| REDIS + WORKER -->|call AI provider| AI[AI Provider\nmock / Ollama / OpenAI] + WORKER -->|update result| PG + API -->|traces + metrics| OTEL + WORKER -->|traces + metrics| OTEL + OTEL --> PROM + OTEL --> GRAF + OTEL --> LOKI +``` + +### Flow summary + +| Step | What happens | +|------|-------------| +| 1 | Developer pushes code to GitHub | +| 2 | CI pipeline: restore → build → test → docker build | +| 3 | Security pipeline: Trivy filesystem + image scan → SARIF upload | +| 4 | Release pipeline: GHCR push → SBOM → Cosign keyless sign → verify | +| 5 | Pipeline updates image digest in `k8s/overlays/dev` | +| 6 | Argo CD detects change → syncs to cluster | +| 7 | `POST /ask` enqueues job to Redis, inserts row in PostgreSQL | +| 8 | KEDA detects queue depth → scales `ai-worker` replicas | +| 9 | Worker dequeues job → calls AI provider → updates PostgreSQL | +| 10 | OTel exports traces/metrics to Prometheus/Grafana/Loki | + +--- + +## Tools and Technologies + +| Tool | Purpose | +|------|---------| +| .NET 8 Minimal API | AI inference 
API service | +| .NET 8 Worker Service | Async job processor | +| Redis | Job queue (FIFO via Redis List) | +| PostgreSQL | Job result persistence | +| Docker / Docker Compose | Local development stack | +| kind | Local Kubernetes cluster | +| Kustomize | Kubernetes manifest management (base + overlays) | +| Argo CD | GitOps continuous deployment | +| KEDA | Event-driven autoscaling from Redis queue | +| OpenTelemetry | Distributed traces, metrics and logs | +| Prometheus | Metrics collection and alerting | +| Grafana | Metrics and trace visualisation | +| Loki | Log aggregation | +| GitHub Actions | CI/CD automation | +| GitHub Container Registry | Docker image registry | +| Trivy | Vulnerability, secret and misconfiguration scanning | +| Cosign | Keyless container image signing (Sigstore) | +| Kyverno | Kubernetes admission policies | +| Terraform | Optional cloud infrastructure (EKS/AKS/GKE) | + +--- + +## Prerequisites + +| Tool | Version | Install | +|------|---------|---------| +| Docker Desktop | ≥ 24 | [docker.com](https://www.docker.com/products/docker-desktop/) | +| kind | ≥ 0.23 | `brew install kind` | +| kubectl | ≥ 1.29 | `brew install kubectl` | +| Helm | ≥ 3.14 | `brew install helm` | +| .NET SDK | 8.0 | [dotnet.microsoft.com](https://dotnet.microsoft.com/download) | +| k6 (optional) | ≥ 0.51 | `brew install k6` | +| Trivy (optional) | ≥ 0.52 | `brew install trivy` | +| Cosign (optional) | ≥ 2.2 | `brew install cosign` | + +--- + +## Local Development (Docker Compose) + +The fastest way to run the full stack locally — no Kubernetes required. 
+ +```bash +cd DevOps-Project-41/app + +# Start all services (api, worker, redis, postgres, otel-collector, prometheus, grafana) +docker compose up --build + +# Test the API +curl http://localhost:8080/health + +# Submit an AI job +curl -X POST http://localhost:8080/ask \ + -H "Content-Type: application/json" \ + -d '{"prompt":"Explain GitOps in simple terms","model":"mock-devops-model"}' + +# Check job result (replace JOB_ID) +curl http://localhost:8080/jobs/JOB_ID + +# Run smoke tests +cd ../tests/smoke && bash smoke-test.sh + +# Open Grafana (admin/admin) +open http://localhost:3000 + +# Open Prometheus +open http://localhost:9091 + +# Tear down +docker compose down -v +``` + +--- + +## Kubernetes Deployment (kind) + +### 1. Create the cluster + +```bash +kind create cluster --config DevOps-Project-41/infra/kind/kind-cluster.yaml +kubectl cluster-info +kubectl get nodes +``` + +### 2. Create the postgres secret + +```bash +kubectl create namespace ai-devsecops +kubectl -n ai-devsecops create secret generic postgres-secret \ + --from-literal=password=aiops-dev-password +``` + +### 3. Deploy with Kustomize (without Argo CD) + +```bash +kubectl apply -k DevOps-Project-41/k8s/overlays/dev +kubectl -n ai-devsecops get pods -w +``` + +### 4. 
Access the API + +```bash +kubectl -n ai-devsecops port-forward svc/ai-api 8080:80 +curl http://localhost:8080/health +``` + +--- + +## GitOps with Argo CD + +```bash +# Install Argo CD +kubectl create namespace argocd +kubectl apply -n argocd --server-side --force-conflicts \ + -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml + +# Wait for Argo CD to be ready +kubectl -n argocd wait --for=condition=Available deployment/argocd-server --timeout=120s + +# Get initial admin password +kubectl -n argocd get secret argocd-initial-admin-secret \ + -o jsonpath="{.data.password}" | base64 -d + +# Port-forward the UI +kubectl -n argocd port-forward svc/argocd-server 8080:443 + +# Apply Argo CD Application (edit GITHUB_OWNER first) +kubectl apply -f DevOps-Project-41/gitops/argocd-app-dev.yaml + +# Watch sync status +kubectl -n argocd get applications +``` + +> Edit `gitops/argocd-app-dev.yaml` and replace `GITHUB_OWNER` with your GitHub username before applying. + +--- + +## Event-Driven Autoscaling with KEDA + +```bash +# Install KEDA +helm repo add kedacore https://kedacore.github.io/charts +helm repo update +helm install keda kedacore/keda --namespace keda --create-namespace + +# Verify KEDA operator is running +kubectl -n keda get pods + +# The ScaledObject is already included in k8s/base/keda-scaledobject-worker.yaml +# After deployment, watch autoscaling: +kubectl -n ai-devsecops get scaledobject +kubectl -n ai-devsecops get hpa +kubectl -n ai-devsecops get deploy ai-worker -w + +# Generate load to trigger scaling +cd DevOps-Project-41/tests/load +API_URL=http://localhost:8080 k6 run k6-ai-jobs.js +``` + +The worker scales from 0 to up to 10 replicas when the `ai-jobs` Redis list grows, and scales back to 0 when the queue is empty. 
+ +--- + +## Observability + +### Install Prometheus + Grafana (Helm) + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace monitoring --create-namespace \ + -f DevOps-Project-41/observability/prometheus-values.yaml + +kubectl -n monitoring port-forward svc/kube-prometheus-stack-grafana 3000:80 +``` + +### Install Loki (Helm) + +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm install loki grafana/loki-stack \ + --namespace monitoring \ + -f DevOps-Project-41/observability/loki-values.yaml +``` + +### Import Grafana Dashboard + +1. Open Grafana at `http://localhost:3000` (admin/admin) +2. Go to **Dashboards → Import** +3. Upload `DevOps-Project-41/observability/grafana-dashboard.json` + +### Dashboard panels + +- API Request Rate +- API P95 Latency +- Job Queue Depth +- Worker Replica Count +- Job Success / Failure Rate +- AI Provider Duration (p95) +- Redis Availability +- PostgreSQL Availability + +--- + +## Supply Chain Security + +### Trivy scanning + +```bash +# Filesystem scan (source + configs) +trivy fs DevOps-Project-41 --severity HIGH,CRITICAL + +# Kubernetes manifest scan +trivy config DevOps-Project-41/k8s + +# Image scan +trivy image ghcr.io/GITHUB_OWNER/ai-api:1.0.0 +``` + +### SBOM generation + +```bash +trivy image --format spdx-json --output sbom-api.spdx.json \ + ghcr.io/GITHUB_OWNER/ai-api:1.0.0 +``` + +See [security/sbom.md](security/sbom.md) for full details. + +### Cosign image verification + +```bash +cosign verify \ + ghcr.io/GITHUB_OWNER/ai-api:1.0.0 \ + --certificate-identity-regexp "https://github.com/GITHUB_OWNER/DevOps-Projects/.github/workflows/project-41-release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com +``` + +See [security/cosign.md](security/cosign.md) for full details. 
+ +### Kyverno admission policies + +```bash +# Install Kyverno +helm repo add kyverno https://kyverno.github.io/kyverno +helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace + +# Apply policies +kubectl apply -f DevOps-Project-41/security/policies/ + +# Test — this should be blocked +kubectl -n ai-devsecops run bad-pod --image=nginx:latest --privileged=true +``` + +Policies enforce: no privileged containers, runAsNonRoot, resource limits, no latest tag, required labels. + +--- + +## Validation + +```bash +# 1. API health +curl http://localhost:8080/health # expects {"status":"healthy"} + +# 2. Full job flow +curl -X POST http://localhost:8080/ask \ + -H "Content-Type: application/json" \ + -d '{"prompt":"What is KEDA?","model":"mock-devops-model"}' +# then GET /jobs/{jobId} until status=completed + +# 3. Kubernetes pods +kubectl -n ai-devsecops get pods # all Running + +# 4. Argo CD sync +kubectl -n argocd get applications # Synced + Healthy + +# 5. KEDA scaling +kubectl -n ai-devsecops get scaledobject # READY=True + +# 6. Prometheus targets +open http://localhost:9091/targets # all UP + +# 7. Cosign verify +cosign verify ghcr.io/GITHUB_OWNER/ai-api:1.0.0 \ + --certificate-identity-regexp ".*release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com +``` + +See [docs/validation-checklist.md](docs/validation-checklist.md) for the full checklist. 
+ +--- + +## Troubleshooting + +| Problem | Cause | Fix | +|---------|-------|-----| +| `docker compose up` fails | Port 8080 in use | `lsof -i :8080` and stop conflicting process | +| API returns 503 on `/ready` | Redis or PostgreSQL not ready | Wait for containers to start; check `docker compose logs redis` | +| Worker not processing jobs | Wrong Redis connection string | Verify `REDIS_CONNECTION_STRING` in env | +| KEDA not scaling | Redis address mismatch in ScaledObject | Check `address` field in `keda-scaledobject-worker.yaml` | +| Argo CD not syncing | Wrong `repoURL` or `path` | Edit `gitops/argocd-app-dev.yaml` with correct values | +| Cosign verify fails | Wrong workflow identity | Check `--certificate-identity-regexp` matches your repo path | +| kind cluster not starting | Docker not running | Start Docker Desktop first | + +See [docs/troubleshooting.md](docs/troubleshooting.md) for detailed guidance. + +--- + +## Cleanup + +```bash +# Remove Argo CD application +kubectl delete -f DevOps-Project-41/gitops/argocd-app-dev.yaml --ignore-not-found=true + +# Remove namespaces +kubectl delete namespace ai-devsecops --ignore-not-found=true +kubectl delete namespace argocd --ignore-not-found=true +kubectl delete namespace keda --ignore-not-found=true +kubectl delete namespace monitoring --ignore-not-found=true + +# Delete kind cluster +kind delete cluster --name ai-devsecops + +# Stop Docker Compose +cd DevOps-Project-41/app && docker compose down -v +``` + +--- + +## Future Improvements + +- Add AKS / EKS / GKE Terraform modules in `infra/terraform/` +- Integrate External Secrets Operator with a cloud secret manager +- Add Istio or Linkerd service mesh for mTLS and traffic management +- Add canary deployment with Argo Rollouts +- Define SLOs with Sloth or Pyrra +- Add a model gateway for multi-provider routing with rate limiting +- Add Kyverno policy to require signed images at admission time +- Add cost dashboard for AI workload compute diff --git 
a/DevOps-Project-41/app/.dockerignore b/DevOps-Project-41/app/.dockerignore new file mode 100644 index 00000000..c2bcf722 --- /dev/null +++ b/DevOps-Project-41/app/.dockerignore @@ -0,0 +1,19 @@ +.git +.gitignore +.github +.vs +.vscode +.idea +**/bin +**/obj +**/out +**/.env +**/*.env +**/*.user +**/*.suo +**/TestResults +**/node_modules +**/*.md +**/Dockerfile* +**/docker-compose* +**/.dockerignore diff --git a/DevOps-Project-41/app/AiPlatform.sln b/DevOps-Project-41/app/AiPlatform.sln new file mode 100644 index 00000000..4898f0bf --- /dev/null +++ b/DevOps-Project-41/app/AiPlatform.sln @@ -0,0 +1,40 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AiApi", "src\AiApi\AiApi.csproj", "{86BE0D76-23F2-406B-A4F6-7652EFFDC1AF}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AiWorker", "src\AiWorker\AiWorker.csproj", "{31D57DDD-AB5B-44F8-988A-4EA4DE137062}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AiProvider", "src\AiProvider\AiProvider.csproj", "{2421F9DB-BCB0-4B67-9D1F-6728B7347E56}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AiApi.Tests", "tests\AiApi.Tests\AiApi.Tests.csproj", "{0F9F347C-F9F2-4A47-929B-AE9AC2198BCF}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AiWorker.Tests", "tests\AiWorker.Tests\AiWorker.Tests.csproj", "{1DE477AB-4D6A-49BD-B599-68FBC647107C}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {86BE0D76-23F2-406B-A4F6-7652EFFDC1AF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {86BE0D76-23F2-406B-A4F6-7652EFFDC1AF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {86BE0D76-23F2-406B-A4F6-7652EFFDC1AF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {86BE0D76-23F2-406B-A4F6-7652EFFDC1AF}.Release|Any CPU.Build.0 = Release|Any CPU 
+ {31D57DDD-AB5B-44F8-988A-4EA4DE137062}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {31D57DDD-AB5B-44F8-988A-4EA4DE137062}.Debug|Any CPU.Build.0 = Debug|Any CPU + {31D57DDD-AB5B-44F8-988A-4EA4DE137062}.Release|Any CPU.ActiveCfg = Release|Any CPU + {31D57DDD-AB5B-44F8-988A-4EA4DE137062}.Release|Any CPU.Build.0 = Release|Any CPU + {2421F9DB-BCB0-4B67-9D1F-6728B7347E56}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2421F9DB-BCB0-4B67-9D1F-6728B7347E56}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2421F9DB-BCB0-4B67-9D1F-6728B7347E56}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2421F9DB-BCB0-4B67-9D1F-6728B7347E56}.Release|Any CPU.Build.0 = Release|Any CPU + {0F9F347C-F9F2-4A47-929B-AE9AC2198BCF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0F9F347C-F9F2-4A47-929B-AE9AC2198BCF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0F9F347C-F9F2-4A47-929B-AE9AC2198BCF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0F9F347C-F9F2-4A47-929B-AE9AC2198BCF}.Release|Any CPU.Build.0 = Release|Any CPU + {1DE477AB-4D6A-49BD-B599-68FBC647107C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {1DE477AB-4D6A-49BD-B599-68FBC647107C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {1DE477AB-4D6A-49BD-B599-68FBC647107C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {1DE477AB-4D6A-49BD-B599-68FBC647107C}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/DevOps-Project-41/app/Dockerfile b/DevOps-Project-41/app/Dockerfile new file mode 100644 index 00000000..c493da56 --- /dev/null +++ b/DevOps-Project-41/app/Dockerfile @@ -0,0 +1,57 @@ +# syntax=docker/dockerfile:1 + +# ─── Build stage ───────────────────────────────────────────────────────────── +FROM mcr.microsoft.com/dotnet/sdk:8.0-alpine AS build +WORKDIR /src + +COPY AiPlatform.sln ./ +COPY src/AiProvider/AiProvider.csproj ./src/AiProvider/ +COPY src/AiApi/AiApi.csproj ./src/AiApi/ +COPY src/AiWorker/AiWorker.csproj ./src/AiWorker/ +COPY tests/AiApi.Tests/AiApi.Tests.csproj ./tests/AiApi.Tests/ +COPY 
tests/AiWorker.Tests/AiWorker.Tests.csproj ./tests/AiWorker.Tests/ + +RUN dotnet restore + +COPY . . + +RUN dotnet build -c Release --no-restore +RUN dotnet test -c Release --no-build --no-restore + +# ─── Publish API ───────────────────────────────────────────────────────────── +FROM build AS publish-api +RUN dotnet publish src/AiApi/AiApi.csproj -c Release --no-build -o /app/api + +# ─── Publish Worker ────────────────────────────────────────────────────────── +FROM build AS publish-worker +RUN dotnet publish src/AiWorker/AiWorker.csproj -c Release --no-build -o /app/worker + +# ─── API runtime image ──────────────────────────────────────────────────────── +FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine AS api +WORKDIR /app + +RUN addgroup -S appgroup && adduser -S appuser -G appgroup +USER appuser + +COPY --from=publish-api --chown=appuser:appgroup /app/api . + +EXPOSE 8080 +ENV ASPNETCORE_URLS=http://+:8080 + +HEALTHCHECK --interval=15s --timeout=5s --start-period=20s --retries=3 \ + CMD wget -qO- http://localhost:8080/health || exit 1 + +ENTRYPOINT ["dotnet", "AiApi.dll"] + +# ─── Worker runtime image ───────────────────────────────────────────────────── +FROM mcr.microsoft.com/dotnet/aspnet:8.0-alpine AS worker +WORKDIR /app + +RUN addgroup -S appgroup && adduser -S appuser -G appgroup +USER appuser + +COPY --from=publish-worker --chown=appuser:appgroup /app/worker . + +EXPOSE 9090 + +ENTRYPOINT ["dotnet", "AiWorker.dll"] diff --git a/DevOps-Project-41/app/docker-compose.yml b/DevOps-Project-41/app/docker-compose.yml new file mode 100644 index 00000000..5c64ef87 --- /dev/null +++ b/DevOps-Project-41/app/docker-compose.yml @@ -0,0 +1,111 @@ +services: + + ai-api: + build: + context: . 
+ dockerfile: Dockerfile + target: api + image: ai-native-devsecops/ai-api:local + ports: + - "8080:8080" + environment: + ASPNETCORE_ENVIRONMENT: Development + ASPNETCORE_URLS: http://+:8080 + REDIS_CONNECTION_STRING: redis:6379 + POSTGRES_CONNECTION_STRING: "Host=postgres;Database=aiops;Username=aiops;Password=aiops" + AI_PROVIDER: mock + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:8080/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + restart: on-failure + + ai-worker: + build: + context: . + dockerfile: Dockerfile + target: worker + image: ai-native-devsecops/ai-worker:local + ports: + - "9090:9090" + environment: + REDIS_CONNECTION_STRING: redis:6379 + POSTGRES_CONNECTION_STRING: "Host=postgres;Database=aiops;Username=aiops;Password=aiops" + AI_PROVIDER: mock + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317 + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + restart: on-failure + + redis: + image: redis:7.2-alpine + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 10 + restart: unless-stopped + + postgres: + image: postgres:16-alpine + ports: + - "5432:5432" + environment: + POSTGRES_DB: aiops + POSTGRES_USER: aiops + POSTGRES_PASSWORD: aiops + volumes: + - pg-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U aiops -d aiops"] + interval: 5s + timeout: 3s + retries: 10 + restart: unless-stopped + + otel-collector: + image: otel/opentelemetry-collector-contrib:0.104.0 + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ../observability/otel-collector-local.yaml:/etc/otel-collector-config.yaml:ro + ports: + - "4317:4317" + - "4318:4318" + - "8889:8889" + restart: unless-stopped + + prometheus: + image: 
prom/prometheus:v2.53.0 + ports: + - "9091:9090" + volumes: + - ../observability/prometheus-local.yml:/etc/prometheus/prometheus.yml:ro + restart: unless-stopped + + grafana: + image: grafana/grafana:11.1.0 + ports: + - "3000:3000" + environment: + GF_SECURITY_ADMIN_PASSWORD: admin + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - grafana-data:/var/lib/grafana + restart: unless-stopped + +volumes: + pg-data: + grafana-data: diff --git a/DevOps-Project-41/app/src/AiApi/AiApi.csproj b/DevOps-Project-41/app/src/AiApi/AiApi.csproj new file mode 100644 index 00000000..8a2264d4 --- /dev/null +++ b/DevOps-Project-41/app/src/AiApi/AiApi.csproj @@ -0,0 +1,28 @@ + + + + net8.0 + enable + enable + AiApi + + + + + + + + + + + + + + + + + + + + + diff --git a/DevOps-Project-41/app/src/AiApi/JobRepository.cs b/DevOps-Project-41/app/src/AiApi/JobRepository.cs new file mode 100644 index 00000000..57794b9f --- /dev/null +++ b/DevOps-Project-41/app/src/AiApi/JobRepository.cs @@ -0,0 +1,86 @@ +using Npgsql; + +namespace AiApi; + +public class JobRepository +{ + private readonly string _connectionString; + + public JobRepository(IConfiguration config) + { + /* Resolve the PostgreSQL connection string. POSTGRES_CONNECTION_STRING (set per container in docker-compose.yml) is checked first: appsettings.json always defines ConnectionStrings:Postgres with a localhost value, so consulting it first would shadow the container-provided setting. Falls back to ConnectionStrings:Postgres, then to the hard-coded local default. */ + _connectionString = config["POSTGRES_CONNECTION_STRING"] + ?? config.GetConnectionString("Postgres") + ?? 
"Host=localhost;Database=aiops;Username=aiops;Password=aiops"; + } + + public async Task EnsureSchemaAsync() + { + await using var conn = new NpgsqlConnection(_connectionString); + await conn.OpenAsync(); + await using var cmd = conn.CreateCommand(); + cmd.CommandText = """ + CREATE TABLE IF NOT EXISTS ai_jobs ( + job_id TEXT PRIMARY KEY, + status TEXT NOT NULL DEFAULT 'queued', + prompt TEXT NOT NULL, + model TEXT NOT NULL, + provider TEXT, + result TEXT, + error TEXT, + duration_ms BIGINT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + completed_at TIMESTAMPTZ + ); + """; + await cmd.ExecuteNonQueryAsync(); + } + + public async Task InsertJobAsync(string jobId, string prompt, string model) + { + await using var conn = new NpgsqlConnection(_connectionString); + await conn.OpenAsync(); + await using var cmd = conn.CreateCommand(); + cmd.CommandText = "INSERT INTO ai_jobs (job_id, prompt, model, status, created_at) VALUES (@id, @prompt, @model, 'queued', NOW())"; + cmd.Parameters.AddWithValue("id", jobId); + cmd.Parameters.AddWithValue("prompt", prompt); + cmd.Parameters.AddWithValue("model", model); + await cmd.ExecuteNonQueryAsync(); + } + + public async Task GetJobAsync(string jobId) + { + await using var conn = new NpgsqlConnection(_connectionString); + await conn.OpenAsync(); + await using var cmd = conn.CreateCommand(); + cmd.CommandText = "SELECT job_id, status, result, model, provider, duration_ms, created_at, completed_at, error FROM ai_jobs WHERE job_id = @id"; + cmd.Parameters.AddWithValue("id", jobId); + await using var reader = await cmd.ExecuteReaderAsync(); + if (!await reader.ReadAsync()) return null; + + return new JobStatusResponse( + reader.GetString(0), + reader.GetString(1), + reader.IsDBNull(2) ? null : reader.GetString(2), + reader.IsDBNull(3) ? null : reader.GetString(3), + reader.IsDBNull(4) ? null : reader.GetString(4), + reader.IsDBNull(5) ? null : reader.GetInt64(5), + reader.GetFieldValue(6), + reader.IsDBNull(7) ? 
null : reader.GetFieldValue(7), + reader.IsDBNull(8) ? null : reader.GetString(8) + ); + } + + public async Task CanConnectAsync() + { + try + { + await using var conn = new NpgsqlConnection(_connectionString); + await conn.OpenAsync(); + return true; + } + catch + { + return false; + } + } +} diff --git a/DevOps-Project-41/app/src/AiApi/Models.cs b/DevOps-Project-41/app/src/AiApi/Models.cs new file mode 100644 index 00000000..06ac704e --- /dev/null +++ b/DevOps-Project-41/app/src/AiApi/Models.cs @@ -0,0 +1,25 @@ +namespace AiApi; + +public record AskRequest(string Prompt, string Model = "mock-devops-model"); + +public record AskResponse(string JobId, string Status); + +public record JobStatusResponse( + string JobId, + string Status, + string? Result, + string? Model, + string? Provider, + long? DurationMs, + DateTimeOffset CreatedAt, + DateTimeOffset? CompletedAt, + string? Error +); + +public static class JobStatus +{ + public const string Queued = "queued"; + public const string Processing = "processing"; + public const string Completed = "completed"; + public const string Failed = "failed"; +} diff --git a/DevOps-Project-41/app/src/AiApi/Program.cs b/DevOps-Project-41/app/src/AiApi/Program.cs new file mode 100644 index 00000000..753cf9a5 --- /dev/null +++ b/DevOps-Project-41/app/src/AiApi/Program.cs @@ -0,0 +1,118 @@ +using System.Diagnostics; +using AiApi; +using OpenTelemetry.Metrics; +using OpenTelemetry.Resources; +using OpenTelemetry.Trace; +using Prometheus; +using StackExchange.Redis; + +var builder = WebApplication.CreateBuilder(args); + +// Redis +var redisConnection = builder.Configuration["REDIS_CONNECTION_STRING"] ?? 
"localhost:6379"; +builder.Services.AddSingleton(_ => ConnectionMultiplexer.Connect(redisConnection)); + +// PostgreSQL repository +builder.Services.AddSingleton(); + +// OpenTelemetry +var otelEndpoint = builder.Configuration["OTEL_EXPORTER_OTLP_ENDPOINT"]; +builder.Services.AddOpenTelemetry() + .ConfigureResource(r => r.AddService("ai-api")) + .WithTracing(t => + { + t.AddAspNetCoreInstrumentation() + .AddHttpClientInstrumentation(); + if (!string.IsNullOrEmpty(otelEndpoint)) + t.AddOtlpExporter(o => o.Endpoint = new Uri(otelEndpoint)); + }) + .WithMetrics(m => + { + m.AddAspNetCoreInstrumentation() + .AddRuntimeInstrumentation(); + if (!string.IsNullOrEmpty(otelEndpoint)) + m.AddOtlpExporter(o => o.Endpoint = new Uri(otelEndpoint)); + }); + +var app = builder.Build(); + +// Ensure DB schema on startup +using (var scope = app.Services.CreateScope()) +{ + var repo = scope.ServiceProvider.GetRequiredService(); + try { await repo.EnsureSchemaAsync(); } + catch (Exception ex) { app.Logger.LogWarning(ex, "Could not initialise DB schema — will retry on first request"); } +} + +// Prometheus metrics endpoint +app.UseHttpMetrics(); +app.MapMetrics("/metrics"); + +// Custom counters +var jobsCreated = Metrics.CreateCounter("ai_jobs_created_total", "Total AI jobs created"); +var jobsFailed = Metrics.CreateCounter("ai_jobs_enqueue_failed_total", "Total jobs that failed to enqueue"); + +var activitySource = new ActivitySource("ai-api"); + +// GET /health +app.MapGet("/health", () => Results.Ok(new { status = "healthy", timestamp = DateTimeOffset.UtcNow })); + +// GET /ready +app.MapGet("/ready", async (IConnectionMultiplexer redis, JobRepository repo) => +{ + var redisOk = false; + var pgOk = false; + try { await redis.GetDatabase().PingAsync(); redisOk = true; } catch { } + try { pgOk = await repo.CanConnectAsync(); } catch { } + + if (redisOk && pgOk) + return Results.Ok(new { status = "ready", redis = "ok", postgres = "ok" }); + + return Results.Json( + new { status = 
"degraded", redis = redisOk ? "ok" : "unavailable", postgres = pgOk ? "ok" : "unavailable" }, + statusCode: 503); +}); + +// POST /ask +app.MapPost("/ask", async (AskRequest request, IConnectionMultiplexer redis, JobRepository repo, ILogger logger) => +{ + if (string.IsNullOrWhiteSpace(request.Prompt)) + return Results.BadRequest(new { error = "prompt is required" }); + + var jobId = Guid.NewGuid().ToString("N"); + + using var activity = activitySource.StartActivity("http.post.ask"); + activity?.SetTag("job.id", jobId); + activity?.SetTag("job.model", request.Model); + + try + { + await repo.InsertJobAsync(jobId, request.Prompt, request.Model); + + var db = redis.GetDatabase(); + var payload = System.Text.Json.JsonSerializer.Serialize(new { jobId, prompt = request.Prompt, model = request.Model }); + await db.ListLeftPushAsync("ai-jobs", payload); + activity?.SetTag("queue.enqueued", true); + + jobsCreated.Inc(); + logger.LogInformation("Job {JobId} enqueued with model {Model}", jobId, request.Model); + + return Results.Accepted($"/jobs/{jobId}", new AskResponse(jobId, JobStatus.Queued)); + } + catch (Exception ex) + { + jobsFailed.Inc(); + logger.LogError(ex, "Failed to enqueue job {JobId}", jobId); + activity?.SetStatus(ActivityStatusCode.Error, ex.Message); + return Results.Problem("Failed to enqueue job", statusCode: 500); + } +}); + +// GET /jobs/{jobId} +app.MapGet("/jobs/{jobId}", async (string jobId, JobRepository repo) => +{ + var job = await repo.GetJobAsync(jobId); + return job is null ? 
Results.NotFound(new { error = "job not found" }) : Results.Ok(job); +}); + +app.Run(); diff --git a/DevOps-Project-41/app/src/AiApi/appsettings.json b/DevOps-Project-41/app/src/AiApi/appsettings.json new file mode 100644 index 00000000..6979908b --- /dev/null +++ b/DevOps-Project-41/app/src/AiApi/appsettings.json @@ -0,0 +1,14 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning" + } + }, + "AllowedHosts": "*", + "ConnectionStrings": { + "Postgres": "Host=localhost;Database=aiops;Username=aiops;Password=aiops" + }, + "REDIS_CONNECTION_STRING": "localhost:6379", + "AI_PROVIDER": "mock" +} diff --git a/DevOps-Project-41/app/src/AiProvider/AiProvider.csproj b/DevOps-Project-41/app/src/AiProvider/AiProvider.csproj new file mode 100644 index 00000000..0cd0d9ea --- /dev/null +++ b/DevOps-Project-41/app/src/AiProvider/AiProvider.csproj @@ -0,0 +1,15 @@ + + + + net8.0 + enable + enable + + + + + + + + + diff --git a/DevOps-Project-41/app/src/AiProvider/IAiProvider.cs b/DevOps-Project-41/app/src/AiProvider/IAiProvider.cs new file mode 100644 index 00000000..c87f0359 --- /dev/null +++ b/DevOps-Project-41/app/src/AiProvider/IAiProvider.cs @@ -0,0 +1,11 @@ +namespace AiProvider; + +public interface IAiProvider +{ + Task CompleteAsync(AiRequest request, CancellationToken cancellationToken = default); + string ProviderName { get; } +} + +public record AiRequest(string Prompt, string Model, string JobId); + +public record AiResponse(string JobId, string Content, string Model, string Provider, long DurationMs); diff --git a/DevOps-Project-41/app/src/AiProvider/MockLlmProvider.cs b/DevOps-Project-41/app/src/AiProvider/MockLlmProvider.cs new file mode 100644 index 00000000..11843416 --- /dev/null +++ b/DevOps-Project-41/app/src/AiProvider/MockLlmProvider.cs @@ -0,0 +1,43 @@ +using System.Diagnostics; +using Microsoft.Extensions.Logging; + +namespace AiProvider; + +public class MockLlmProvider : IAiProvider +{ + private readonly ILogger 
_logger; + + private static readonly Dictionary _responses = new(StringComparer.OrdinalIgnoreCase) + { + ["gitops"] = "GitOps is a practice where Git is the single source of truth for declarative infrastructure and application configuration. Changes are made via pull requests and automatically reconciled by operators like Argo CD.", + ["kubernetes"] = "Kubernetes is an open-source container orchestration platform that automates deployment, scaling, and management of containerised applications across clusters.", + ["devsecops"] = "DevSecOps integrates security practices into the DevOps pipeline, shifting security left so that vulnerabilities are detected early in development rather than post-deployment.", + ["keda"] = "KEDA (Kubernetes Event-Driven Autoscaling) scales workloads based on external event sources such as queue lengths, HTTP requests, or custom metrics.", + ["opentelemetry"] = "OpenTelemetry is a vendor-neutral observability framework for generating, collecting, and exporting traces, metrics, and logs from distributed systems.", + }; + + public string ProviderName => "mock"; + + public MockLlmProvider(ILogger logger) + { + _logger = logger; + } + + public async Task CompleteAsync(AiRequest request, CancellationToken cancellationToken = default) + { + var sw = Stopwatch.StartNew(); + + _logger.LogInformation("MockLLM processing job {JobId} with model {Model}", request.JobId, request.Model); + + await Task.Delay(Random.Shared.Next(100, 600), cancellationToken); + + var keyword = _responses.Keys.FirstOrDefault(k => request.Prompt.Contains(k, StringComparison.OrdinalIgnoreCase)); + var content = keyword is not null + ? _responses[keyword] + : $"[MockLLM] Received: \"{request.Prompt}\". This is a deterministic mock response for local testing. 
Configure AI_PROVIDER=ollama or AI_PROVIDER=openai-compatible to use a real model."; + + sw.Stop(); + + return new AiResponse(request.JobId, content, request.Model, ProviderName, sw.ElapsedMilliseconds); + } +} diff --git a/DevOps-Project-41/app/src/AiProvider/OllamaProvider.cs b/DevOps-Project-41/app/src/AiProvider/OllamaProvider.cs new file mode 100644 index 00000000..991bf755 --- /dev/null +++ b/DevOps-Project-41/app/src/AiProvider/OllamaProvider.cs @@ -0,0 +1,37 @@ +using System.Diagnostics; +using System.Net.Http.Json; +using Microsoft.Extensions.Logging; + +namespace AiProvider; + +public class OllamaProvider : IAiProvider +{ + private readonly HttpClient _http; + private readonly ILogger _logger; + + public string ProviderName => "ollama"; + + public OllamaProvider(HttpClient http, ILogger logger) + { + _http = http; + _logger = logger; + } + + public async Task CompleteAsync(AiRequest request, CancellationToken cancellationToken = default) + { + var sw = Stopwatch.StartNew(); + + _logger.LogInformation("Ollama processing job {JobId} with model {Model}", request.JobId, request.Model); + + var payload = new { model = request.Model, prompt = request.Prompt, stream = false }; + var response = await _http.PostAsJsonAsync("/api/generate", payload, cancellationToken); + response.EnsureSuccessStatusCode(); + + var result = await response.Content.ReadFromJsonAsync(cancellationToken: cancellationToken); + sw.Stop(); + + return new AiResponse(request.JobId, result?.Response ?? 
string.Empty, request.Model, ProviderName, sw.ElapsedMilliseconds); + } + + private record OllamaResponse(string Response); +} diff --git a/DevOps-Project-41/app/src/AiProvider/OpenAiCompatibleProvider.cs b/DevOps-Project-41/app/src/AiProvider/OpenAiCompatibleProvider.cs new file mode 100644 index 00000000..65bf2727 --- /dev/null +++ b/DevOps-Project-41/app/src/AiProvider/OpenAiCompatibleProvider.cs @@ -0,0 +1,46 @@ +using System.Diagnostics; +using System.Net.Http.Json; +using Microsoft.Extensions.Logging; + +namespace AiProvider; + +public class OpenAiCompatibleProvider : IAiProvider +{ + private readonly HttpClient _http; + private readonly ILogger _logger; + + public string ProviderName => "openai-compatible"; + + public OpenAiCompatibleProvider(HttpClient http, ILogger logger) + { + _http = http; + _logger = logger; + } + + public async Task CompleteAsync(AiRequest request, CancellationToken cancellationToken = default) + { + var sw = Stopwatch.StartNew(); + + _logger.LogInformation("OpenAI-compatible processing job {JobId} with model {Model}", request.JobId, request.Model); + + var payload = new + { + model = request.Model, + messages = new[] { new { role = "user", content = request.Prompt } } + }; + + var response = await _http.PostAsJsonAsync("/chat/completions", payload, cancellationToken); + response.EnsureSuccessStatusCode(); + + var result = await response.Content.ReadFromJsonAsync(cancellationToken: cancellationToken); + var content = result?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty; + + sw.Stop(); + + return new AiResponse(request.JobId, content, request.Model, ProviderName, sw.ElapsedMilliseconds); + } + + private record OpenAiResponse(OpenAiChoice[]? 
Choices); + private record OpenAiChoice(OpenAiMessage Message); + private record OpenAiMessage(string Content); +} diff --git a/DevOps-Project-41/app/src/AiWorker/AiWorker.csproj b/DevOps-Project-41/app/src/AiWorker/AiWorker.csproj new file mode 100644 index 00000000..979da42f --- /dev/null +++ b/DevOps-Project-41/app/src/AiWorker/AiWorker.csproj @@ -0,0 +1,25 @@ + + + + net8.0 + enable + enable + AiWorker + + + + + + + + + + + + + + + + + + diff --git a/DevOps-Project-41/app/src/AiWorker/JobUpdater.cs b/DevOps-Project-41/app/src/AiWorker/JobUpdater.cs new file mode 100644 index 00000000..817db931 --- /dev/null +++ b/DevOps-Project-41/app/src/AiWorker/JobUpdater.cs @@ -0,0 +1,54 @@ +using Npgsql; + +namespace AiWorker; + +public class JobUpdater +{ + private readonly string _connectionString; + + public JobUpdater(IConfiguration config) + { + /* Resolve the PostgreSQL connection string. POSTGRES_CONNECTION_STRING (set per container in docker-compose.yml) is checked first: appsettings.json always defines ConnectionStrings:Postgres with a localhost value, so consulting it first would shadow the container-provided setting. Falls back to ConnectionStrings:Postgres, then to the hard-coded local default. */ + _connectionString = config["POSTGRES_CONNECTION_STRING"] + ?? config.GetConnectionString("Postgres") + ?? "Host=localhost;Database=aiops;Username=aiops;Password=aiops"; + } + + /* Sets the job's status column to 'processing'. */ + public async Task MarkProcessingAsync(string jobId) + { + await using var conn = new NpgsqlConnection(_connectionString); + await conn.OpenAsync(); + await using var cmd = conn.CreateCommand(); + cmd.CommandText = "UPDATE ai_jobs SET status = 'processing' WHERE job_id = @id"; + cmd.Parameters.AddWithValue("id", jobId); + await cmd.ExecuteNonQueryAsync(); + } + + /* Stores the result, provider and duration, stamps completed_at and sets status to 'completed'. */ + public async Task MarkCompletedAsync(string jobId, string result, string provider, long durationMs) + { + await using var conn = new NpgsqlConnection(_connectionString); + await conn.OpenAsync(); + await using var cmd = conn.CreateCommand(); + cmd.CommandText = """ + UPDATE ai_jobs + SET status = 'completed', result = @result, provider = @provider, + duration_ms = @duration, completed_at = NOW() + WHERE job_id = @id + """; + cmd.Parameters.AddWithValue("id", jobId); + cmd.Parameters.AddWithValue("result", result); + cmd.Parameters.AddWithValue("provider", provider); + 
cmd.Parameters.AddWithValue("duration", durationMs); + await cmd.ExecuteNonQueryAsync(); + } + + public async Task MarkFailedAsync(string jobId, string error) + { + await using var conn = new NpgsqlConnection(_connectionString); + await conn.OpenAsync(); + await using var cmd = conn.CreateCommand(); + cmd.CommandText = "UPDATE ai_jobs SET status = 'failed', error = @error, completed_at = NOW() WHERE job_id = @id"; + cmd.Parameters.AddWithValue("id", jobId); + cmd.Parameters.AddWithValue("error", error); + await cmd.ExecuteNonQueryAsync(); + } +} diff --git a/DevOps-Project-41/app/src/AiWorker/Program.cs b/DevOps-Project-41/app/src/AiWorker/Program.cs new file mode 100644 index 00000000..ddd30a30 --- /dev/null +++ b/DevOps-Project-41/app/src/AiWorker/Program.cs @@ -0,0 +1,67 @@ +using AiProvider; +using AiWorker; +using OpenTelemetry.Resources; +using OpenTelemetry.Trace; +using Prometheus; +using StackExchange.Redis; + +var builder = Host.CreateApplicationBuilder(args); + +// Redis +var redisConnection = builder.Configuration["REDIS_CONNECTION_STRING"] ?? "localhost:6379"; +builder.Services.AddSingleton(_ => ConnectionMultiplexer.Connect(redisConnection)); + +// Job updater +builder.Services.AddSingleton(); + +// AI Provider selection +var providerName = builder.Configuration["AI_PROVIDER"] ?? "mock"; +builder.Services.AddHttpClient(); + +builder.Services.AddSingleton(sp => +{ + var loggerFactory = sp.GetRequiredService(); + + return providerName switch + { + "ollama" => new OllamaProvider( + CreateHttpClient(sp, builder.Configuration["OLLAMA_BASE_URL"] ?? "http://localhost:11434"), + loggerFactory.CreateLogger()), + + "openai-compatible" => new OpenAiCompatibleProvider( + CreateHttpClient(sp, builder.Configuration["OPENAI_COMPATIBLE_BASE_URL"] ?? 
"http://localhost:8000", + builder.Configuration["OPENAI_API_KEY"]), + loggerFactory.CreateLogger()), + + _ => new MockLlmProvider(loggerFactory.CreateLogger()) + }; +}); + +// OpenTelemetry +var otelEndpoint = builder.Configuration["OTEL_EXPORTER_OTLP_ENDPOINT"]; +builder.Services.AddOpenTelemetry() + .ConfigureResource(r => r.AddService("ai-worker")) + .WithTracing(t => + { + t.AddHttpClientInstrumentation(); + if (!string.IsNullOrEmpty(otelEndpoint)) + t.AddOtlpExporter(o => o.Endpoint = new Uri(otelEndpoint)); + }); + +builder.Services.AddHostedService(); + +var host = builder.Build(); + +// Expose Prometheus metrics on port 9090 +var metricServer = new MetricServer(port: 9090); +metricServer.Start(); + +await host.RunAsync(); + +static HttpClient CreateHttpClient(IServiceProvider sp, string baseUrl, string? apiKey = null) +{ + var client = new HttpClient { BaseAddress = new Uri(baseUrl) }; + if (!string.IsNullOrEmpty(apiKey)) + client.DefaultRequestHeaders.Add("Authorization", $"Bearer {apiKey}"); + return client; +} diff --git a/DevOps-Project-41/app/src/AiWorker/Worker.cs b/DevOps-Project-41/app/src/AiWorker/Worker.cs new file mode 100644 index 00000000..cf4114ae --- /dev/null +++ b/DevOps-Project-41/app/src/AiWorker/Worker.cs @@ -0,0 +1,98 @@ +using System.Diagnostics; +using System.Text.Json; +using AiProvider; +using Prometheus; +using StackExchange.Redis; + +namespace AiWorker; + +public class Worker : BackgroundService +{ + private readonly ILogger _logger; + private readonly IConnectionMultiplexer _redis; + private readonly IAiProvider _aiProvider; + private readonly JobUpdater _jobUpdater; + private readonly ActivitySource _activitySource = new("ai-worker"); + + private static readonly Counter JobsCompleted = Metrics.CreateCounter("ai_jobs_completed_total", "Total jobs completed successfully"); + private static readonly Counter JobsFailed = Metrics.CreateCounter("ai_jobs_failed_total", "Total jobs failed"); + private static readonly Histogram 
JobDuration = Metrics.CreateHistogram("ai_job_duration_seconds", "AI job processing duration in seconds"); + private static readonly Gauge QueueDepth = Metrics.CreateGauge("ai_queue_depth", "Current Redis queue depth"); + + /* Queue payloads are written by the API with camelCase keys (jobId, prompt, model); the Web defaults enable case-insensitive property matching so they bind to JobMessage. Cached once because JsonSerializerOptions instances are meant to be reused. */ + private static readonly JsonSerializerOptions JobJsonOptions = new(JsonSerializerDefaults.Web); + + public Worker(ILogger logger, IConnectionMultiplexer redis, IAiProvider aiProvider, JobUpdater jobUpdater) + { + _logger = logger; + _redis = redis; + _aiProvider = aiProvider; + _jobUpdater = jobUpdater; + } + + /* Main loop: publishes the current "ai-jobs" list length as the queue-depth gauge, pops one message from the right end of the list, and processes it. Sleeps 1 s when the queue is empty and 2 s after an unexpected error; exits when stoppingToken is cancelled. */ + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + _logger.LogInformation("AI Worker started. Provider: {Provider}", _aiProvider.ProviderName); + + while (!stoppingToken.IsCancellationRequested) + { + try + { + var db = _redis.GetDatabase(); + var queueLen = await db.ListLengthAsync("ai-jobs"); + QueueDepth.Set(queueLen); + + var raw = await db.ListRightPopAsync("ai-jobs"); + if (raw.IsNullOrEmpty) + { + await Task.Delay(1000, stoppingToken); + continue; + } + + var job = JsonSerializer.Deserialize(raw!, JobJsonOptions); + /* A payload without a job id cannot be tracked in the database; drop it instead of failing inside the status update. */ + if (job is null || string.IsNullOrWhiteSpace(job.JobId)) + { + _logger.LogWarning("Discarding malformed job message"); + continue; + } + + await ProcessJobAsync(job, stoppingToken); + } + catch (OperationCanceledException) { break; } + catch (Exception ex) + { + _logger.LogError(ex, "Unexpected error in worker loop"); + await Task.Delay(2000, stoppingToken); + } + } + + _logger.LogInformation("AI Worker stopped"); + } + + private async Task ProcessJobAsync(JobMessage job, CancellationToken ct) + { + using var activity = _activitySource.StartActivity("worker.process.job"); + activity?.SetTag("job.id", job.JobId); + activity?.SetTag("job.model", job.Model); + + _logger.LogInformation("Processing job {JobId}", job.JobId); + + await _jobUpdater.MarkProcessingAsync(job.JobId); + + using var timer = JobDuration.NewTimer(); + try + { + var response = await _aiProvider.CompleteAsync( + new AiRequest(job.Prompt, job.Model, job.JobId), ct); + + await _jobUpdater.MarkCompletedAsync(job.JobId, response.Content, response.Provider, response.DurationMs); + JobsCompleted.Inc(); + + activity?.SetTag("job.provider", 
response.Provider); + activity?.SetTag("job.duration_ms", response.DurationMs); + _logger.LogInformation("Job {JobId} completed in {DurationMs}ms", job.JobId, response.DurationMs); + } + catch (Exception ex) + { + await _jobUpdater.MarkFailedAsync(job.JobId, ex.Message); + JobsFailed.Inc(); + activity?.SetStatus(ActivityStatusCode.Error, ex.Message); + _logger.LogError(ex, "Job {JobId} failed", job.JobId); + } + } + + private record JobMessage(string JobId, string Prompt, string Model); +} diff --git a/DevOps-Project-41/app/src/AiWorker/appsettings.json b/DevOps-Project-41/app/src/AiWorker/appsettings.json new file mode 100644 index 00000000..9ff1f7f1 --- /dev/null +++ b/DevOps-Project-41/app/src/AiWorker/appsettings.json @@ -0,0 +1,13 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.Hosting.Lifetime": "Information" + } + }, + "ConnectionStrings": { + "Postgres": "Host=localhost;Database=aiops;Username=aiops;Password=aiops" + }, + "REDIS_CONNECTION_STRING": "localhost:6379", + "AI_PROVIDER": "mock" +} diff --git a/DevOps-Project-41/app/tests/AiApi.Tests/AiApi.Tests.csproj b/DevOps-Project-41/app/tests/AiApi.Tests/AiApi.Tests.csproj new file mode 100644 index 00000000..e5581b9e --- /dev/null +++ b/DevOps-Project-41/app/tests/AiApi.Tests/AiApi.Tests.csproj @@ -0,0 +1,24 @@ + + + + net8.0 + enable + enable + false + + + + + + + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + + + diff --git a/DevOps-Project-41/app/tests/AiApi.Tests/MockLlmProviderTests.cs b/DevOps-Project-41/app/tests/AiApi.Tests/MockLlmProviderTests.cs new file mode 100644 index 00000000..803f2016 --- /dev/null +++ b/DevOps-Project-41/app/tests/AiApi.Tests/MockLlmProviderTests.cs @@ -0,0 +1,59 @@ +using AiProvider; +using Microsoft.Extensions.Logging.Abstractions; +using Xunit; + +namespace AiApi.Tests; + +public class MockLlmProviderTests +{ + private readonly MockLlmProvider _provider = new(NullLogger.Instance); + + [Fact] + public 
async Task CompleteAsync_ReturnsResponse_ForKnownKeyword() + { + var request = new AiRequest("Explain GitOps in simple terms", "mock-devops-model", "job-001"); + + var response = await _provider.CompleteAsync(request); + + Assert.Equal("job-001", response.JobId); + Assert.NotEmpty(response.Content); + Assert.Contains("GitOps", response.Content, StringComparison.OrdinalIgnoreCase); + Assert.Equal("mock", response.Provider); + } + + [Fact] + public async Task CompleteAsync_ReturnsFallback_ForUnknownPrompt() + { + var request = new AiRequest("Tell me about the weather", "mock-devops-model", "job-002"); + + var response = await _provider.CompleteAsync(request); + + Assert.Equal("job-002", response.JobId); + Assert.Contains("[MockLLM]", response.Content); + } + + [Fact] + public async Task CompleteAsync_ReturnsPositiveDuration() + { + var request = new AiRequest("Kubernetes overview", "mock-devops-model", "job-003"); + + var response = await _provider.CompleteAsync(request); + + Assert.True(response.DurationMs >= 0); + } + + [Theory] + [InlineData("kubernetes")] + [InlineData("devsecops")] + [InlineData("keda")] + [InlineData("opentelemetry")] + public async Task CompleteAsync_RecognisesAllKeywords(string keyword) + { + var request = new AiRequest($"What is {keyword}?", "mock-devops-model", $"job-{keyword}"); + + var response = await _provider.CompleteAsync(request); + + Assert.NotEmpty(response.Content); + Assert.DoesNotContain("[MockLLM]", response.Content); + } +} diff --git a/DevOps-Project-41/app/tests/AiWorker.Tests/AiProviderContractTests.cs b/DevOps-Project-41/app/tests/AiWorker.Tests/AiProviderContractTests.cs new file mode 100644 index 00000000..a95f2598 --- /dev/null +++ b/DevOps-Project-41/app/tests/AiWorker.Tests/AiProviderContractTests.cs @@ -0,0 +1,34 @@ +using AiProvider; +using Microsoft.Extensions.Logging.Abstractions; +using Xunit; + +namespace AiWorker.Tests; + +public class AiProviderContractTests +{ + [Fact] + public async Task 
MockProvider_AlwaysReturnsNonEmptyContent() + { + var provider = new MockLlmProvider(NullLogger.Instance); + var request = new AiRequest("How does KEDA work?", "mock-devops-model", "contract-test-001"); + + var response = await provider.CompleteAsync(request); + + Assert.NotNull(response); + Assert.NotEmpty(response.Content); + Assert.Equal("contract-test-001", response.JobId); + Assert.Equal("mock", response.Provider); + Assert.True(response.DurationMs >= 0); + } + + [Fact] + public async Task MockProvider_CancellationToken_IsRespected() + { + var provider = new MockLlmProvider(NullLogger.Instance); + var cts = new CancellationTokenSource(); + cts.Cancel(); + + await Assert.ThrowsAnyAsync( + () => provider.CompleteAsync(new AiRequest("test", "model", "job-cancel"), cts.Token)); + } +} diff --git a/DevOps-Project-41/app/tests/AiWorker.Tests/AiWorker.Tests.csproj b/DevOps-Project-41/app/tests/AiWorker.Tests/AiWorker.Tests.csproj new file mode 100644 index 00000000..e5581b9e --- /dev/null +++ b/DevOps-Project-41/app/tests/AiWorker.Tests/AiWorker.Tests.csproj @@ -0,0 +1,24 @@ + + + + net8.0 + enable + enable + false + + + + + + + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + + + + diff --git a/DevOps-Project-41/architecture/architecture.md b/DevOps-Project-41/architecture/architecture.md new file mode 100644 index 00000000..18a1dd52 --- /dev/null +++ b/DevOps-Project-41/architecture/architecture.md @@ -0,0 +1,106 @@ +# Architecture — AI-Native DevSecOps Platform + +## System overview + +This platform is designed around three axes: + +1. **Developer workflow** — git push triggers automated CI/CD with security gates +2. **Runtime platform** — Kubernetes with GitOps, event-driven autoscaling and observability +3. 
**Security posture** — supply chain security from code to deployment + +## Components + +### Application layer + +| Component | Type | Responsibility | +|-----------|------|---------------| +| `ai-api` | .NET 8 Minimal API | Accept HTTP requests, validate, enqueue to Redis, return job status | +| `ai-worker` | .NET 8 Worker Service | Dequeue jobs, invoke AI provider, persist results, emit telemetry | +| `AiProvider` | .NET 8 Class Library | Abstraction over MockLLM, Ollama, and OpenAI-compatible providers | + +### Infrastructure layer + +| Component | Purpose | +|-----------|---------| +| Redis 7.2 | FIFO job queue using Redis List (`ai-jobs`) | +| PostgreSQL 16 | Persistent job state (`ai_jobs` table) | +| kind (local) | Local Kubernetes cluster with 1 control-plane + 2 workers | + +### CI/CD layer + +| Workflow | Trigger | Key jobs | +|----------|---------|----------| +| `ci.yml` | push / PR | restore → build → test → docker-build | +| `security.yml` | push / weekly schedule | trivy-fs → trivy-config → trivy-image → upload-sarif | +| `release.yml` | version tag / manual | build+push GHCR → sign → SBOM → verify | + +### GitOps layer + +| Component | Configuration | +|-----------|--------------| +| Argo CD | Watches `k8s/overlays/dev`, auto-syncs on digest change | +| Kustomize | `k8s/base` + `k8s/overlays/{dev,prod}` for environment-specific config | +| KEDA | `ScaledObject` on Redis list `ai-jobs` — scales worker 0→10 replicas | + +### Observability layer + +``` +Application SDK (OpenTelemetry) + → OpenTelemetry Collector (OTLP gRPC) + → Prometheus exporter (:8889) + → Logging exporter + → Tempo/Loki (optional) +Prometheus scrapes OTel Collector +Grafana queries Prometheus + Loki +``` + +### Security layer + +| Control | Where applied | +|---------|--------------| +| Trivy scan | GitHub Actions + local CLI | +| SBOM generation | GitHub Actions release pipeline | +| Cosign keyless signing | GitHub Actions OIDC → Sigstore Rekor | +| Kyverno policies | Kubernetes 
admission webhook | +| Non-root containers | All Kubernetes workloads | +| Network policies | Restrict pod-to-pod communication | +| Secret separation | Kubernetes Secrets (dev) / ESO (prod) | + +## Data flow — job lifecycle + +``` +POST /ask + → API validates request + → API inserts row in PostgreSQL (status=queued) + → API pushes JSON payload to Redis list ai-jobs + → API returns 202 Accepted with jobId + +KEDA (every 10s) + → reads Redis list length + → adjusts HPA target replicas (0–10) + +Worker (continuous) + → pops job from Redis (non-blocking RPOP; sleeps 1s when the queue is empty) + → updates PostgreSQL (status=processing) + → calls AI provider + → updates PostgreSQL (status=completed/failed, result, duration_ms) + → emits OTel span + metrics + +GET /jobs/{jobId} + → API reads from PostgreSQL + → returns current status + result +``` + +## Overlay strategy + +``` +k8s/base/ → all resources at default scale +k8s/overlays/dev/ → small resources, mock AI, imagePullPolicy Always +k8s/overlays/prod/ → 2 replicas, full resource limits, manual Argo CD sync +``` + +## Security boundary + +Kyverno enforces its policies at admission time, so a non-compliant Pod is rejected before it can be scheduled, regardless of who applies the manifest. + +Network policies restrict lateral movement between Pods. The `ai-api` and `ai-worker` can only talk to Redis, PostgreSQL, and the OTel Collector — not to each other directly.
diff --git a/DevOps-Project-41/architecture/diagrams/architecture.mmd b/DevOps-Project-41/architecture/diagrams/architecture.mmd new file mode 100644 index 00000000..fc057db0 --- /dev/null +++ b/DevOps-Project-41/architecture/diagrams/architecture.mmd @@ -0,0 +1,56 @@ +flowchart TD + DEV[Developer] -->|git push| GH[GitHub Repository] + + subgraph CI_CD [CI/CD — GitHub Actions] + CI[ci.yml\nbuild + test] + SEC[security.yml\nTrivy scan + SARIF] + REL[release.yml\nGHCR push + SBOM + Cosign] + end + + GH --> CI + GH --> SEC + GH --> REL + REL --> GHCR[GitHub Container Registry\nghcr.io/owner/ai-api\nghcr.io/owner/ai-worker] + REL -->|update image digest| MANIFESTS[k8s/overlays/dev\nkustomization.yaml] + + MANIFESTS --> ARGO[Argo CD\nautomated reconciliation] + + subgraph K8S [Kubernetes Cluster — ai-devsecops namespace] + direction TB + API[ai-api\n.NET 8 Minimal API\nPORT 8080] + WORKER[ai-worker\n.NET 8 Worker\nPORT 9090 metrics] + REDIS[(Redis 7.2\nJob Queue)] + PG[(PostgreSQL 16\nJob Results)] + KEDA_OBJ[KEDA ScaledObject\nmin=0 max=10] + OTEL[OTel Collector\n:4317 gRPC :4318 HTTP] + PROM[Prometheus\n:9090] + GRAF[Grafana\n:3000] + LOKI[Loki\n:3100] + end + + ARGO --> K8S + + API -->|1. enqueue job| REDIS + API -->|2. insert row| PG + KEDA_OBJ -->|scale replicas| WORKER + REDIS -->|3. dequeue job| WORKER + WORKER -->|4. call provider| AIPROV[AI Provider\nmock / Ollama /\nOpenAI-compatible] + WORKER -->|5. 
update result| PG + API -->|OTLP traces + metrics| OTEL + WORKER -->|OTLP traces + metrics| OTEL + OTEL -->|scrape| PROM + OTEL --> LOKI + PROM --> GRAF + LOKI --> GRAF + + subgraph SECURITY [Supply Chain Security] + TRIVY[Trivy\nfs + image + config] + SBOM_BOX[SBOM\nSPDX + CycloneDX] + COSIGN[Cosign\nkeyless signing] + KYVERNO[Kyverno\nadmission policies] + end + + REL --> TRIVY + REL --> SBOM_BOX + REL --> COSIGN + KYVERNO -.->|admission webhook| K8S diff --git a/DevOps-Project-41/architecture/diagrams/diagrams-1.png b/DevOps-Project-41/architecture/diagrams/diagrams-1.png new file mode 100644 index 00000000..d26e9080 Binary files /dev/null and b/DevOps-Project-41/architecture/diagrams/diagrams-1.png differ diff --git a/DevOps-Project-41/architecture/diagrams/diagrams-2.png b/DevOps-Project-41/architecture/diagrams/diagrams-2.png new file mode 100644 index 00000000..0650ec8c Binary files /dev/null and b/DevOps-Project-41/architecture/diagrams/diagrams-2.png differ diff --git a/DevOps-Project-41/docs/cleanup.md b/DevOps-Project-41/docs/cleanup.md new file mode 100644 index 00000000..211fb93d --- /dev/null +++ b/DevOps-Project-41/docs/cleanup.md @@ -0,0 +1,62 @@ +# Cleanup Instructions + +## Docker Compose + +```bash +cd DevOps-Project-41/app + +# Stop all containers and remove volumes +docker compose down -v + +# Remove local images (optional) +docker rmi ai-native-devsecops/ai-api:local ai-native-devsecops/ai-worker:local 2>/dev/null || true +``` + +## Kubernetes + +```bash +# Remove Argo CD applications (triggers pruning of managed resources) +kubectl delete -f DevOps-Project-41/gitops/argocd-app-dev.yaml --ignore-not-found=true +kubectl delete -f DevOps-Project-41/gitops/argocd-app-prod.yaml --ignore-not-found=true + +# Remove application namespace +kubectl delete namespace ai-devsecops --ignore-not-found=true + +# Remove Argo CD +kubectl delete namespace argocd --ignore-not-found=true + +# Remove KEDA +helm uninstall keda -n keda 2>/dev/null || true +kubectl 
delete namespace keda --ignore-not-found=true + +# Remove Prometheus + Grafana + Loki +helm uninstall kube-prometheus-stack -n monitoring 2>/dev/null || true +helm uninstall loki -n monitoring 2>/dev/null || true +kubectl delete namespace monitoring --ignore-not-found=true + +# Remove Kyverno +helm uninstall kyverno -n kyverno 2>/dev/null || true +kubectl delete namespace kyverno --ignore-not-found=true +kubectl delete clusterpolicies --all 2>/dev/null || true +``` + +## kind cluster + +```bash +kind delete cluster --name ai-devsecops +``` + +## GHCR images (optional) + +Delete container images via GitHub UI: +- Go to your GitHub profile → Packages +- Select `ai-api` or `ai-worker` +- Delete specific versions or the entire package + +## Local build artefacts + +```bash +cd DevOps-Project-41/app +find . -name "bin" -o -name "obj" | xargs rm -rf +rm -f sbom-*.json trivy-*.sarif TestResults/*.trx +``` diff --git a/DevOps-Project-41/docs/gitops.md b/DevOps-Project-41/docs/gitops.md new file mode 100644 index 00000000..76ecfd98 --- /dev/null +++ b/DevOps-Project-41/docs/gitops.md @@ -0,0 +1,78 @@ +# GitOps Workflow Guide + +## How GitOps works in this project + +``` +Git commit + → GitHub Actions CI (build + test + scan) + → Release pipeline (push to GHCR + sign) + → Pipeline updates image digest in k8s/overlays/dev/kustomization.yaml + → Argo CD detects change in Git + → Argo CD syncs desired state to Kubernetes cluster +``` + +Git is the single source of truth. No `kubectl apply` is run manually in production — all changes go through Git. 
+ +## Install Argo CD + +```bash +kubectl create namespace argocd +kubectl apply -n argocd --server-side --force-conflicts \ + -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml +kubectl -n argocd wait --for=condition=Available deployment/argocd-server --timeout=120s +``` + +## Access the UI + +```bash +# Get initial admin password +kubectl -n argocd get secret argocd-initial-admin-secret \ + -o jsonpath="{.data.password}" | base64 -d && echo + +# Port-forward +kubectl -n argocd port-forward svc/argocd-server 8080:443 +# Open: https://localhost:8080 (accept self-signed cert) +``` + +## Apply the dev Application + +Edit `gitops/argocd-app-dev.yaml` and replace `GITHUB_OWNER` with your GitHub username, then: + +```bash +kubectl apply -f DevOps-Project-41/gitops/argocd-app-dev.yaml +kubectl -n argocd get applications +kubectl -n argocd get application ai-native-platform-dev -o yaml +``` + +## Sync behaviour + +| Environment | Sync mode | Prune | Self-heal | +|-------------|-----------|-------|-----------| +| dev | Automated | Yes | Yes | +| prod | Manual | Yes | No | + +## Promote to prod + +1. Merge changes to `master` branch +2. Update image tag in `k8s/overlays/prod/kustomization.yaml` +3. Apply the prod Application: `kubectl apply -f gitops/argocd-app-prod.yaml` +4. 
In Argo CD UI, click **Sync** → **Synchronize** + +## Using the Argo CD CLI + +```bash +# Install +brew install argocd + +# Login +argocd login localhost:8080 --insecure --username admin + +# List apps +argocd app list + +# Sync manually +argocd app sync ai-native-platform-dev + +# Watch status +argocd app wait ai-native-platform-dev --health +``` diff --git a/DevOps-Project-41/docs/observability.md b/DevOps-Project-41/docs/observability.md new file mode 100644 index 00000000..813ce840 --- /dev/null +++ b/DevOps-Project-41/docs/observability.md @@ -0,0 +1,86 @@ +# Observability Guide + +## Stack overview + +``` +Application (ai-api, ai-worker) + → OpenTelemetry SDK + → OpenTelemetry Collector (OTLP gRPC :4317) + → Prometheus (metrics scrape :8889) + → Loki (logs via Promtail) + → Grafana (visualisation) +``` + +## Signals + +### Traces + +| Span name | Service | Description | +|-----------|---------|-------------| +| `http.post.ask` | ai-api | Incoming /ask request | +| `queue.enqueue.redis` | ai-api | Job enqueue to Redis | +| `worker.process.job` | ai-worker | Full job processing | +| `ai.provider.call` | ai-worker | AI provider invocation | + +### Metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `ai_jobs_created_total` | Counter | Total jobs created via /ask | +| `ai_jobs_completed_total` | Counter | Total successfully completed jobs | +| `ai_jobs_failed_total` | Counter | Total failed jobs | +| `ai_job_duration_seconds` | Histogram | Job processing time | +| `ai_queue_depth` | Gauge | Current Redis list length | +| `ai_jobs_enqueue_failed_total` | Counter | Failed enqueue attempts | + +### Logs + +Logs are structured JSON with `traceId` correlation when an active trace exists. 
+ +## Install Prometheus + Grafana + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace monitoring --create-namespace \ + -f DevOps-Project-41/observability/prometheus-values.yaml + +kubectl -n monitoring get pods +kubectl -n monitoring port-forward svc/kube-prometheus-stack-grafana 3000:80 +``` + +## Install Loki + Promtail + +```bash +helm repo add grafana https://grafana.github.io/helm-charts +helm install loki grafana/loki-stack \ + --namespace monitoring \ + -f DevOps-Project-41/observability/loki-values.yaml +``` + +## Import Grafana dashboard + +1. Open `http://localhost:3000` (admin/admin) +2. **Dashboards → Import → Upload JSON file** +3. Select `observability/grafana-dashboard.json` + +## Deploy OpenTelemetry Collector on Kubernetes + +```bash +kubectl apply -f DevOps-Project-41/observability/otel-collector.yaml +kubectl -n ai-devsecops get pods -l app.kubernetes.io/name=otel-collector +``` + +## Verify metrics are flowing + +```bash +# Check Prometheus targets +open http://localhost:9091/targets + +# Query a metric directly (URL is quoted so the shell does not glob-expand `?`) +curl -s 'http://localhost:9091/api/v1/query?query=ai_jobs_created_total' | jq .
+ +# Check OTel collector is receiving +kubectl -n ai-devsecops logs -l app.kubernetes.io/name=otel-collector --tail=20 +``` diff --git a/DevOps-Project-41/docs/security.md b/DevOps-Project-41/docs/security.md new file mode 100644 index 00000000..25f2d263 --- /dev/null +++ b/DevOps-Project-41/docs/security.md @@ -0,0 +1,89 @@ +# Security Guide + +## Defence-in-depth layers + +| Layer | Tool | What it protects | +|-------|------|-----------------| +| Code scanning | Trivy (fs) | Vulnerabilities in source dependencies | +| Image scanning | Trivy (image) | CVEs in base image and OS packages | +| Config scanning | Trivy (config) | Kubernetes manifest misconfigurations | +| SBOM | Trivy / Syft | Component inventory + licence compliance | +| Image signing | Cosign (keyless) | Tamper-evident provenance via Sigstore | +| Admission control | Kyverno | Runtime enforcement of security policies | +| Secret management | Kubernetes Secrets (dev) / External Secrets (prod) | Credential isolation | + +## Trivy + +Trivy is configured in `security/trivy.yaml` to report `HIGH` and `CRITICAL` findings. + +```bash +# Source code scan +trivy fs DevOps-Project-41 --config DevOps-Project-41/security/trivy.yaml + +# Kubernetes manifests +trivy config DevOps-Project-41/k8s + +# Container image +trivy image ghcr.io/GITHUB_OWNER/ai-api:1.0.0 + +# Generate JSON report +trivy image --format json --output report.json ghcr.io/GITHUB_OWNER/ai-api:1.0.0 +``` + +## SBOM + +See [security/sbom.md](../security/sbom.md) for full SBOM generation and inspection instructions. + +## Cosign image signing + +See [security/cosign.md](../security/cosign.md) for signing and verification instructions. + +## Kyverno policies + +Policies are in `security/policies/`. They are enforced on the `ai-devsecops` namespace. 
+ +| Policy file | Rule | Action | +|-------------|------|--------| +| `deny-privileged-containers.yaml` | No `privileged: true` containers | Enforce | +| `require-non-root.yaml` | `runAsNonRoot: true` required | Enforce | +| `require-resource-limits.yaml` | CPU + memory requests and limits required | Enforce | +| `restrict-latest-tag.yaml` | No `:latest` image tag | Enforce | +| `require-labels.yaml` | `app.kubernetes.io/name` + `version` labels required | Enforce | + +### Install Kyverno and apply policies + +```bash +helm repo add kyverno https://kyverno.github.io/kyverno +helm install kyverno kyverno/kyverno --namespace kyverno --create-namespace +kubectl -n kyverno get pods + +kubectl apply -f DevOps-Project-41/security/policies/ +kubectl get clusterpolicies +``` + +### Test a policy rejection + +```bash +# This should be REJECTED (latest tag) +kubectl -n ai-devsecops run test --image=nginx:latest --dry-run=server +# Error from server: admission webhook denied the request: Image tag 'latest' is not allowed. 
+ +# This should be ACCEPTED (explicit tag) +kubectl -n ai-devsecops run test --image=nginx:1.27 --dry-run=server +``` + +## Container security baseline + +All workloads in this project follow the Kubernetes Restricted pod security standard: + +- `runAsNonRoot: true` +- `allowPrivilegeEscalation: false` +- `capabilities.drop: [ALL]` +- `readOnlyRootFilesystem: true` (API and worker) +- `automountServiceAccountToken: false` + +## Secret handling + +- Development: Kubernetes Secrets via `kubectl create secret` or `secretGenerator` in Kustomize +- Production: Use [External Secrets Operator](https://external-secrets.io/) with AWS Secrets Manager, Azure Key Vault, or GCP Secret Manager +- Never commit `secret.yaml` — only `secret.example.yaml` is tracked in Git diff --git a/DevOps-Project-41/docs/setup-cloud.md b/DevOps-Project-41/docs/setup-cloud.md new file mode 100644 index 00000000..4059fd60 --- /dev/null +++ b/DevOps-Project-41/docs/setup-cloud.md @@ -0,0 +1,114 @@ +# Cloud Setup Guide (Optional) + +> This guide describes deploying the platform to a managed Kubernetes service. All core functionality works locally with kind — cloud is optional. 
+ +## Supported targets + +- **Azure AKS** — recommended for .NET workloads +- **AWS EKS** +- **GCP GKE** + +## Prerequisites + +```bash +# Azure +brew install azure-cli +az login +az aks install-cli + +# AWS +brew install awscli eksctl +aws configure + +# GCP +brew install google-cloud-sdk +gcloud auth login +gcloud components install gke-gcloud-auth-plugin +``` + +## AKS (Azure) + +```bash +# Create resource group and cluster +az group create --name rg-ai-devsecops --location westeurope +az aks create \ + --resource-group rg-ai-devsecops \ + --name aks-ai-devsecops \ + --node-count 2 \ + --node-vm-size Standard_DS2_v2 \ + --generate-ssh-keys + +# Get credentials +az aks get-credentials --resource-group rg-ai-devsecops --name aks-ai-devsecops +kubectl get nodes +``` + +## EKS (AWS) + +```bash +eksctl create cluster \ + --name ai-devsecops \ + --region eu-west-1 \ + --nodegroup-name workers \ + --node-type t3.medium \ + --nodes 2 \ + --with-oidc + +kubectl get nodes +``` + +## GKE (GCP) + +```bash +gcloud container clusters create ai-devsecops \ + --zone europe-west1-b \ + --machine-type e2-standard-2 \ + --num-nodes 2 \ + --workload-pool=$(gcloud config get-value project).svc.id.goog + +gcloud container clusters get-credentials ai-devsecops --zone europe-west1-b +kubectl get nodes +``` + +## Deploy the platform + +Once you have a cluster configured, follow the same steps as the local Kubernetes setup: + +```bash +# Create namespace and postgres secret (replace YOUR_POSTGRES_PASSWORD with a strong password) +kubectl create namespace ai-devsecops +kubectl -n ai-devsecops create secret generic postgres-secret \ + --from-literal=password=YOUR_POSTGRES_PASSWORD + +# Install KEDA +helm install keda kedacore/keda --namespace keda --create-namespace + +# Install observability stack +helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --namespace monitoring --create-namespace \ + -f DevOps-Project-41/observability/prometheus-values.yaml + +# Install Argo CD and apply the Application +kubectl create namespace argocd
+kubectl apply -n argocd --server-side --force-conflicts \ + -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml +kubectl apply -f DevOps-Project-41/gitops/argocd-app-dev.yaml +``` + +## Terraform (future) + +The `infra/terraform/` directory is reserved for Terraform modules to provision the cloud infrastructure above. Contributions welcome. + +## Cleanup (cloud) + +```bash +# AKS +az aks delete --resource-group rg-ai-devsecops --name aks-ai-devsecops --yes +az group delete --name rg-ai-devsecops --yes + +# EKS +eksctl delete cluster --name ai-devsecops --region eu-west-1 + +# GKE +gcloud container clusters delete ai-devsecops --zone europe-west1-b +``` diff --git a/DevOps-Project-41/docs/setup-local.md b/DevOps-Project-41/docs/setup-local.md new file mode 100644 index 00000000..5f45f482 --- /dev/null +++ b/DevOps-Project-41/docs/setup-local.md @@ -0,0 +1,90 @@ +# Local Setup Guide + +## Prerequisites + +Install the following tools before starting: + +```bash +# macOS (Homebrew) +brew install kind kubectl helm dotnet k6 trivy cosign + +# Verify +docker --version # ≥ 24 +kind --version # ≥ 0.23 +kubectl version --client # ≥ 1.29 +helm version # ≥ 3.14 +dotnet --version # 8.0.x +``` + +## Option 1 — Docker Compose (fastest) + +```bash +cd DevOps-Project-41/app +docker compose up --build + +# Verify +curl http://localhost:8080/health +# Grafana: http://localhost:3000 (admin/admin) +# Prometheus: http://localhost:9091 +``` + +## Option 2 — Local Kubernetes with kind + +### Step 1: Create the cluster + +```bash +kind create cluster --config DevOps-Project-41/infra/kind/kind-cluster.yaml +kubectl cluster-info +``` + +### Step 2: Build and load images into kind + +```bash +cd DevOps-Project-41/app +docker build --target api -t ai-api:dev . +docker build --target worker -t ai-worker:dev . 
+ +kind load docker-image ai-api:dev --name ai-devsecops +kind load docker-image ai-worker:dev --name ai-devsecops +``` + +### Step 3: Create the postgres secret + +```bash +kubectl create namespace ai-devsecops +kubectl -n ai-devsecops create secret generic postgres-secret \ + --from-literal=password=aiops-dev-password +``` + +### Step 4: Deploy with Kustomize + +```bash +# From the repository root: update images in overlays/dev/kustomization.yaml to ai-api:dev and ai-worker:dev +kubectl apply -k DevOps-Project-41/k8s/overlays/dev +kubectl -n ai-devsecops get pods -w +``` + +### Step 5: Port-forward and test + +```bash +kubectl -n ai-devsecops port-forward svc/ai-api 8080:80 & +curl http://localhost:8080/health +bash DevOps-Project-41/tests/smoke/smoke-test.sh +``` + +## AI Provider Configuration + +| Mode | `AI_PROVIDER` value | Extra env var required | +|------|--------------------|-----------------------| +| Mock (default) | `mock` | None | +| Ollama | `ollama` | `OLLAMA_BASE_URL=http://ollama:11434` | +| OpenAI-compatible | `openai-compatible` | `OPENAI_COMPATIBLE_BASE_URL`, `OPENAI_API_KEY` | + +To switch provider in Docker Compose, edit `docker-compose.yml` and change `AI_PROVIDER`. + +## Running tests + +```bash +cd DevOps-Project-41/app +dotnet test -c Release +``` diff --git a/DevOps-Project-41/docs/troubleshooting.md b/DevOps-Project-41/docs/troubleshooting.md new file mode 100644 index 00000000..e86874b1 --- /dev/null +++ b/DevOps-Project-41/docs/troubleshooting.md @@ -0,0 +1,196 @@ +# Troubleshooting Guide + +## Docker Compose + +### `docker compose up` fails immediately + +**Symptom:** Port binding error. + +``` +Error response from daemon: Ports are not available: exposing port TCP 0.0.0.0:8080 +``` + +**Fix:** Find and stop the process using the port. + +```bash +lsof -i :8080 +kill -9 PID # replace PID with the process ID reported by lsof +``` + +### API returns 503 on `/ready` + +**Symptom:** Redis or PostgreSQL not yet ready. + +**Fix:** Wait 15–30 seconds for healthcheck retries.
Check individual service logs: + +```bash +docker compose logs redis +docker compose logs postgres +``` + +### Worker not processing jobs + +**Symptom:** Jobs stay in `queued` state indefinitely. + +**Fix:** Check worker logs and confirm `REDIS_CONNECTION_STRING` is reachable. + +```bash +docker compose logs ai-worker +docker compose exec redis redis-cli llen ai-jobs +``` + +--- + +## Kubernetes (kind) + +### Pods in `ImagePullBackOff` + +**Cause:** Image not present in kind cluster. + +**Fix:** Load image manually. + +```bash +kind load docker-image ai-api:dev --name ai-devsecops +kind load docker-image ai-worker:dev --name ai-devsecops +``` + +### Pods stuck in `Pending` + +**Cause:** Insufficient resources or PVC not bound. + +**Fix:** + +```bash +kubectl -n ai-devsecops describe pod +kubectl -n ai-devsecops get pvc +``` + +For PVC issues on kind, ensure the default StorageClass is available: + +```bash +kubectl get storageclass +``` + +### `kubectl apply -k` fails with version errors + +**Cause:** Old kubectl version. + +**Fix:** Update to kubectl ≥ 1.29: + +```bash +brew upgrade kubectl +``` + +--- + +## Argo CD + +### Argo CD Application stuck in `OutOfSync` + +**Fix:** Check the source path and branch: + +```bash +kubectl -n argocd describe application ai-native-platform-dev +argocd app diff ai-native-platform-dev +``` + +### Argo CD cannot reach the Git repository + +**Fix:** Ensure the `repoURL` in `gitops/argocd-app-dev.yaml` is a public repository or add SSH credentials: + +```bash +argocd repo add https://github.com/GITHUB_OWNER/DevOps-Projects.git +``` + +--- + +## KEDA + +### Worker not scaling despite jobs in queue + +**Step 1:** Verify KEDA operator is running. + +```bash +kubectl -n keda get pods +``` + +**Step 2:** Check the ScaledObject status. + +```bash +kubectl -n ai-devsecops describe scaledobject ai-worker-scaledobject +``` + +**Step 3:** Verify Redis address matches the Service name. 
+ +The `address` in `keda-scaledobject-worker.yaml` must match the Redis Service FQDN: + +``` +redis.ai-devsecops.svc.cluster.local:6379 +``` + +**Step 4:** Check KEDA operator logs. + +```bash +kubectl -n keda logs -l app=keda-operator --tail=30 +``` + +--- + +## Cosign + +### `cosign verify` fails with identity mismatch + +**Fix:** Ensure `--certificate-identity-regexp` matches your exact repository and workflow file path: + +```bash +--certificate-identity-regexp "https://github.com/YOUR_OWNER/DevOps-Projects/.github/workflows/project-41-release.yml.*" +``` + +### `cosign sign` fails with OIDC error in GitHub Actions + +**Fix:** The workflow must grant the `id-token: write` permission (at the workflow or job level). Check `project-41-release.yml`: + +```yaml +permissions: + id-token: write + packages: write +``` + +--- + +## GitHub Actions CI + +### CI fails on `dotnet restore` + +**Fix:** Ensure .NET 8 SDK is specified correctly in `project-41-ci.yml`: + +```yaml +dotnet-version: "8.0.x" +``` + +### Docker build cache miss on every run + +**Fix:** Ensure `cache-from` and `cache-to` are set correctly: + +```yaml +cache-from: type=gha +cache-to: type=gha,mode=max +``` + +--- + +## PostgreSQL + +### `NpgsqlException: connection refused` + +**Fix:** Check the connection string format. For Docker Compose: + +``` +Host=postgres;Database=aiops;Username=aiops;Password=aiops +``` + +For Kubernetes (StatefulSet headless service): + +``` +Host=postgres.ai-devsecops.svc.cluster.local;Database=aiops;Username=aiops;Password=YOUR_POSTGRES_PASSWORD +``` diff --git a/DevOps-Project-41/docs/validation-checklist.md b/DevOps-Project-41/docs/validation-checklist.md new file mode 100644 index 00000000..a2722294 --- /dev/null +++ b/DevOps-Project-41/docs/validation-checklist.md @@ -0,0 +1,147 @@ +# Platform Validation Checklist + +Use this checklist to verify each component after deployment. + +## 1.
Local Development (Docker Compose) + +- [ ] `docker compose up --build` completes without errors +- [ ] `curl http://localhost:8080/health` → `{"status":"healthy"}` +- [ ] `POST /ask` returns `jobId` with `status: queued` +- [ ] `GET /jobs/{jobId}` eventually returns `status: completed` with a `result` +- [ ] `bash tests/smoke/smoke-test.sh` — all tests pass +- [ ] Grafana accessible at `http://localhost:3000` +- [ ] Prometheus accessible at `http://localhost:9091` + +## 2. .NET Tests + +```bash +cd DevOps-Project-41/app +dotnet test -c Release +``` + +- [ ] All unit tests pass +- [ ] MockLlmProvider returns expected responses for known keywords +- [ ] CancellationToken is respected + +## 3. Kubernetes Deployment + +```bash +kubectl -n ai-devsecops get pods +``` + +- [ ] `ai-api` pod is `Running` and `Ready` +- [ ] `ai-worker` pod is `Running` and `Ready` +- [ ] `redis` pod is `Running` and `Ready` +- [ ] `postgres-0` StatefulSet pod is `Running` and `Ready` +- [ ] `otel-collector` pod is `Running` and `Ready` + +```bash +kubectl -n ai-devsecops port-forward svc/ai-api 8080:80 +curl http://localhost:8080/health +``` + +- [ ] API responds inside cluster + +## 4. GitOps — Argo CD + +```bash +kubectl -n argocd get applications +``` + +- [ ] `ai-native-platform-dev` shows `Synced` and `Healthy` +- [ ] Changing a manifest in Git triggers automatic reconciliation within 3 minutes + +## 5. KEDA Autoscaling + +```bash +kubectl -n ai-devsecops get scaledobject +kubectl -n ai-devsecops get hpa +``` + +- [ ] ScaledObject `READY=True` +- [ ] Worker replica count is 0 when queue is empty +- [ ] Run load test: `API_URL=http://localhost:8080 k6 run tests/load/k6-ai-jobs.js` +- [ ] Worker replicas increase during load +- [ ] Worker scales back to 0 after load test ends + +```bash +kubectl -n ai-devsecops get deploy ai-worker -w +``` + +## 6. 
Observability + +```bash +open http://localhost:9091/targets +``` + +- [ ] Prometheus scrapes `ai-api` target (UP) +- [ ] Prometheus scrapes `ai-worker` target (UP) +- [ ] Prometheus scrapes `otel-collector` target (UP) + +In Grafana: + +- [ ] Dashboard `AI-Native DevSecOps Platform` is importable +- [ ] `API Request Rate` panel shows data after sending requests +- [ ] `Job Queue Depth` panel shows queue depth +- [ ] `Worker Replica Count` panel reflects KEDA scaling + +## 7. Supply Chain Security + +```bash +trivy fs DevOps-Project-41 --severity HIGH,CRITICAL +``` + +- [ ] Trivy filesystem scan completes + +```bash +trivy config DevOps-Project-41/k8s +``` + +- [ ] Trivy config scan completes with no CRITICAL misconfigurations + +After release pipeline runs: + +```bash +cosign verify ghcr.io/GITHUB_OWNER/ai-api:1.0.0 \ + --certificate-identity-regexp ".*release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com +``` + +- [ ] Cosign verification succeeds +- [ ] SBOM artefacts appear in GitHub Actions run + +## 8. Kyverno Policies + +```bash +kubectl get clusterpolicies +``` + +- [ ] 5 policies are `Ready` + +```bash +# Should be rejected +kubectl -n ai-devsecops run test --image=nginx:latest --dry-run=server 2>&1 | grep "denied" +``` + +- [ ] `:latest` tag rejected by `restrict-latest-tag` policy + +## 9. CI Pipeline + +Check GitHub Actions tab after pushing: + +- [ ] `CI` workflow passes on push +- [ ] `.NET` tests pass +- [ ] Docker images build successfully +- [ ] `Security Scanning` workflow uploads SARIF to GitHub Security tab +- [ ] `Release` workflow (on tag) pushes images to GHCR + +## 10. 
Cleanup + +```bash +kind delete cluster --name ai-devsecops +docker compose down -v +``` + +- [ ] kind cluster deleted cleanly +- [ ] Docker volumes removed +- [ ] No orphan namespaces in local cluster diff --git a/DevOps-Project-41/gitops/argocd-app-dev.yaml b/DevOps-Project-41/gitops/argocd-app-dev.yaml new file mode 100644 index 00000000..11ef250c --- /dev/null +++ b/DevOps-Project-41/gitops/argocd-app-dev.yaml @@ -0,0 +1,34 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: ai-native-platform-dev + namespace: argocd + labels: + app.kubernetes.io/part-of: ai-native-devsecops-platform + environment: dev + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://github.com/GITHUB_OWNER/DevOps-Projects.git + targetRevision: devops-project + path: DevOps-Project-41/k8s/overlays/dev + destination: + server: https://kubernetes.default.svc + namespace: ai-devsecops + syncPolicy: + automated: + prune: true + selfHeal: true + allowEmpty: false + syncOptions: + - CreateNamespace=true + - PrunePropagationPolicy=foreground + - PruneLast=true + retry: + limit: 5 + backoff: + duration: 5s + factor: 2 + maxDuration: 3m diff --git a/DevOps-Project-41/gitops/argocd-app-prod.yaml b/DevOps-Project-41/gitops/argocd-app-prod.yaml new file mode 100644 index 00000000..86d0efad --- /dev/null +++ b/DevOps-Project-41/gitops/argocd-app-prod.yaml @@ -0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: ai-native-platform-prod + namespace: argocd + labels: + app.kubernetes.io/part-of: ai-native-devsecops-platform + environment: prod + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://github.com/GITHUB_OWNER/DevOps-Projects.git + targetRevision: main + path: DevOps-Project-41/k8s/overlays/prod + destination: + server: https://kubernetes.default.svc + namespace: ai-devsecops + syncPolicy: + # NOTE(review): source.targetRevision is 'main', but the CI workflows in this change trigger on 'master'; confirm that branch exists. Production uses manual sync —
promotion requires explicit Git change + manual trigger + syncOptions: + - CreateNamespace=true + - PrunePropagationPolicy=foreground + retry: + limit: 3 + backoff: + duration: 10s + factor: 2 + maxDuration: 5m diff --git a/DevOps-Project-41/infra/kind/kind-cluster.yaml b/DevOps-Project-41/infra/kind/kind-cluster.yaml new file mode 100644 index 00000000..3cd5e3bc --- /dev/null +++ b/DevOps-Project-41/infra/kind/kind-cluster.yaml @@ -0,0 +1,25 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: ai-devsecops + +nodes: + - role: control-plane + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 80 + hostPort: 8081 + protocol: TCP + - containerPort: 443 + hostPort: 8443 + protocol: TCP + - role: worker + - role: worker + +networking: + apiServerAddress: "127.0.0.1" + apiServerPort: 6443 diff --git a/DevOps-Project-41/infra/terraform/README.md b/DevOps-Project-41/infra/terraform/README.md new file mode 100644 index 00000000..635a9bc4 --- /dev/null +++ b/DevOps-Project-41/infra/terraform/README.md @@ -0,0 +1,52 @@ +# Terraform — Cloud Infrastructure + +This directory is a placeholder for cloud-managed Kubernetes cluster provisioning. + +## Purpose + +Provision a managed Kubernetes cluster on your preferred cloud provider before deploying +the AI-Native DevSecOps Platform manifests from `k8s/`. 
+ +## Supported Providers + +Choose one module directory to create based on your target platform: + +| Provider | Module path | Cluster type | +|----------|-------------|--------------| +| AWS | `aws/` | EKS | +| Azure | `azure/` | AKS | +| GCP | `gcp/` | GKE | + +## Minimum Variables (all modules) + +| Variable | Description | Example | +|----------|-------------|---------| +| `region` | Cloud region | `eu-west-1` | +| `cluster_name` | Cluster name | `ai-devsecops` | +| `node_count` | Initial node count | `3` | +| `node_type` | VM size / instance type | `t3.medium` | + +## Usage + +```bash +cd infra/terraform/<provider>  # aws, azure, or gcp +terraform init +terraform plan -out=tfplan +terraform apply tfplan +``` + +After the cluster is ready, update your kubeconfig and apply the Kustomize overlay: + +```bash +# AWS example +aws eks update-kubeconfig --region <region> --name <cluster_name> + +# Deploy +kubectl apply -k k8s/overlays/prod +``` + +## Security Notes + +- **Never commit** `*.tfstate`, `*.tfplan`, or `.terraform/` — add them to `.gitignore`. +- Use remote state (S3 + DynamoDB, Azure Blob, GCS) with state locking for team use. +- Rotate cloud credentials used by Terraform regularly; prefer OIDC/Workload Identity where available.
diff --git a/DevOps-Project-41/k8s/base/api-deployment.yaml b/DevOps-Project-41/k8s/base/api-deployment.yaml new file mode 100644 index 00000000..e84a2e69 --- /dev/null +++ b/DevOps-Project-41/k8s/base/api-deployment.yaml @@ -0,0 +1,79 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-api + namespace: ai-devsecops + labels: + app.kubernetes.io/name: ai-api + app.kubernetes.io/version: "1.0.0" + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/component: api +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: ai-api + template: + metadata: + labels: + app.kubernetes.io/name: ai-api + app.kubernetes.io/version: "1.0.0" + app.kubernetes.io/component: api + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + serviceAccountName: ai-platform-sa + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + containers: + - name: ai-api + image: ghcr.io/GITHUB_REPOSITORY/ai-api:latest + imagePullPolicy: Always + ports: + - containerPort: 8080 + name: http + protocol: TCP + envFrom: + - configMapRef: + name: ai-platform-config + - secretRef: + name: ai-platform-secrets + optional: true + volumeMounts: + - name: tmp + mountPath: /tmp + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 20 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + volumes: + - name: tmp + emptyDir: {} diff --git a/DevOps-Project-41/k8s/base/api-service.yaml b/DevOps-Project-41/k8s/base/api-service.yaml new file mode 100644 index 00000000..6056f48f --- /dev/null +++ 
b/DevOps-Project-41/k8s/base/api-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: ai-api + namespace: ai-devsecops + labels: + app.kubernetes.io/name: ai-api + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/component: api +spec: + selector: + app.kubernetes.io/name: ai-api + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP + type: ClusterIP diff --git a/DevOps-Project-41/k8s/base/configmap.yaml b/DevOps-Project-41/k8s/base/configmap.yaml new file mode 100644 index 00000000..bc98b7b3 --- /dev/null +++ b/DevOps-Project-41/k8s/base/configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ai-platform-config + namespace: ai-devsecops + labels: + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/managed-by: kustomize +data: + AI_PROVIDER: "mock" + REDIS_CONNECTION_STRING: "redis:6379" + OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector:4317" + ASPNETCORE_ENVIRONMENT: "Production" + ASPNETCORE_URLS: "http://+:8080" diff --git a/DevOps-Project-41/k8s/base/keda-scaledobject-worker.yaml b/DevOps-Project-41/k8s/base/keda-scaledobject-worker.yaml new file mode 100644 index 00000000..2857bcab --- /dev/null +++ b/DevOps-Project-41/k8s/base/keda-scaledobject-worker.yaml @@ -0,0 +1,21 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: ai-worker-scaledobject + namespace: ai-devsecops + labels: + app.kubernetes.io/name: ai-worker + app.kubernetes.io/part-of: ai-native-devsecops-platform +spec: + scaleTargetRef: + name: ai-worker + minReplicaCount: 0 + maxReplicaCount: 10 + pollingInterval: 10 + cooldownPeriod: 60 + triggers: + - type: redis + metadata: + address: redis.ai-devsecops.svc.cluster.local:6379 + listName: ai-jobs + listLength: "5" diff --git a/DevOps-Project-41/k8s/base/kustomization.yaml b/DevOps-Project-41/k8s/base/kustomization.yaml new file mode 100644 index 00000000..6d1ad1f2 --- /dev/null +++ 
b/DevOps-Project-41/k8s/base/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: ai-devsecops + +resources: + - namespace.yaml + - serviceaccount.yaml + - configmap.yaml + - networkpolicy.yaml + - redis-deployment.yaml + - postgres-statefulset.yaml + - api-deployment.yaml + - api-service.yaml + - worker-deployment.yaml + - worker-service.yaml + - keda-scaledobject-worker.yaml diff --git a/DevOps-Project-41/k8s/base/namespace.yaml b/DevOps-Project-41/k8s/base/namespace.yaml new file mode 100644 index 00000000..24b4775b --- /dev/null +++ b/DevOps-Project-41/k8s/base/namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: ai-devsecops + labels: + app.kubernetes.io/part-of: ai-native-devsecops-platform diff --git a/DevOps-Project-41/k8s/base/networkpolicy.yaml b/DevOps-Project-41/k8s/base/networkpolicy.yaml new file mode 100644 index 00000000..81cdcebf --- /dev/null +++ b/DevOps-Project-41/k8s/base/networkpolicy.yaml @@ -0,0 +1,68 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: ai-api-netpol + namespace: ai-devsecops +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: ai-api + policyTypes: + - Ingress + - Egress + ingress: + - ports: + - port: 8080 + egress: + - ports: + - port: 6379 # Redis + - port: 5432 # PostgreSQL + - port: 4317 # OTLP gRPC + - port: 4318 # OTLP HTTP + - port: 53 # DNS + protocol: UDP +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: ai-worker-netpol + namespace: ai-devsecops +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: ai-worker + policyTypes: + - Ingress + - Egress + ingress: + - ports: + - port: 9090 # Prometheus metrics + egress: + - ports: + - port: 6379 + - port: 5432 + - port: 4317 + - port: 4318 + - port: 443 # External AI providers + - port: 11434 # Ollama + - port: 53 + protocol: UDP +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: 
redis-netpol + namespace: ai-devsecops +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: redis + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: [ai-api, ai-worker]} # select clients by name: the ai-api and ai-worker pod templates do not set app.kubernetes.io/part-of + ports: + - port: 6379 # NOTE(review): the KEDA redis trigger presumably polls from another namespace and would need its own namespaceSelector rule; confirm diff --git a/DevOps-Project-41/k8s/base/postgres-statefulset.yaml b/DevOps-Project-41/k8s/base/postgres-statefulset.yaml new file mode 100644 index 00000000..c57beedc --- /dev/null +++ b/DevOps-Project-41/k8s/base/postgres-statefulset.yaml @@ -0,0 +1,92 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: ai-devsecops + labels: + app.kubernetes.io/name: postgres + app.kubernetes.io/version: "16" + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/component: database +spec: + serviceName: postgres + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: postgres + template: + metadata: + labels: + app.kubernetes.io/name: postgres + app.kubernetes.io/version: "16" + app.kubernetes.io/component: database + spec: + securityContext: + runAsNonRoot: true + runAsUser: 70 + fsGroup: 70 + containers: + - name: postgres + image: postgres:16-alpine + ports: + - containerPort: 5432 + name: postgres + env: + - name: POSTGRES_DB + value: aiops + - name: POSTGRES_USER + value: aiops + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: postgres-secret + key: password + optional: false + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "512Mi" + readinessProbe: + exec: + command: ["pg_isready", "-U", "aiops", "-d", "aiops"] + initialDelaySeconds: 10 + periodSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeClaimTemplates: + - metadata: + name: postgres-data + spec: + accessModes: ["ReadWriteOnce"] + resources: +
requests: + storage: 1Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: ai-devsecops + labels: + app.kubernetes.io/name: postgres + app.kubernetes.io/part-of: ai-native-devsecops-platform +spec: + selector: + app.kubernetes.io/name: postgres + ports: + - port: 5432 + targetPort: 5432 + name: postgres + type: ClusterIP + clusterIP: None diff --git a/DevOps-Project-41/k8s/base/redis-deployment.yaml b/DevOps-Project-41/k8s/base/redis-deployment.yaml new file mode 100644 index 00000000..7cd562ea --- /dev/null +++ b/DevOps-Project-41/k8s/base/redis-deployment.yaml @@ -0,0 +1,66 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + namespace: ai-devsecops + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/version: "7.2" + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/component: cache +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: redis + template: + metadata: + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/version: "7.2" + app.kubernetes.io/component: cache + spec: + securityContext: + runAsNonRoot: true + runAsUser: 999 + containers: + - name: redis + image: redis:7.2-alpine + command: ["redis-server", "--save", "", "--appendonly", "no"] + ports: + - containerPort: 6379 + name: redis + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + readinessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 5 + periodSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL +--- +apiVersion: v1 +kind: Service +metadata: + name: redis + namespace: ai-devsecops + labels: + app.kubernetes.io/name: redis + app.kubernetes.io/part-of: ai-native-devsecops-platform +spec: + selector: + app.kubernetes.io/name: redis + ports: + - port: 6379 + targetPort: 6379 + name: redis + type: ClusterIP diff --git a/DevOps-Project-41/k8s/base/secret.example.yaml 
b/DevOps-Project-41/k8s/base/secret.example.yaml new file mode 100644 index 00000000..6c9da1f0 --- /dev/null +++ b/DevOps-Project-41/k8s/base/secret.example.yaml @@ -0,0 +1,16 @@ +# EXAMPLE ONLY — do not commit real secrets. +# Copy this file to secret.yaml and fill in base64-encoded values, or use +# an External Secrets Operator / Sealed Secrets integration instead. +apiVersion: v1 +kind: Secret +metadata: + name: ai-platform-secrets + namespace: ai-devsecops + labels: + app.kubernetes.io/part-of: ai-native-devsecops-platform +type: Opaque +data: + # echo -n 'Host=postgres;Database=aiops;Username=aiops;Password=CHANGEME' | base64 + POSTGRES_CONNECTION_STRING: "<base64-encoded-connection-string>" # explicit placeholder; a bare key parses as null + # echo -n 'CHANGEME' | base64 + OPENAI_API_KEY: "<base64-encoded-api-key>" # explicit placeholder; replace before applying diff --git a/DevOps-Project-41/k8s/base/serviceaccount.yaml b/DevOps-Project-41/k8s/base/serviceaccount.yaml new file mode 100644 index 00000000..b5feb34b --- /dev/null +++ b/DevOps-Project-41/k8s/base/serviceaccount.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ai-platform-sa + namespace: ai-devsecops + labels: + app.kubernetes.io/part-of: ai-native-devsecops-platform +automountServiceAccountToken: false diff --git a/DevOps-Project-41/k8s/base/worker-deployment.yaml b/DevOps-Project-41/k8s/base/worker-deployment.yaml new file mode 100644 index 00000000..2e9a5279 --- /dev/null +++ b/DevOps-Project-41/k8s/base/worker-deployment.yaml @@ -0,0 +1,77 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-worker + namespace: ai-devsecops + labels: + app.kubernetes.io/name: ai-worker + app.kubernetes.io/version: "1.0.0" + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/component: worker +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: ai-worker + template: + metadata: + labels: + app.kubernetes.io/name: ai-worker + app.kubernetes.io/version: "1.0.0" + app.kubernetes.io/component: worker + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" +
prometheus.io/path: "/metrics" + spec: + serviceAccountName: ai-platform-sa + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + containers: + - name: ai-worker + image: ghcr.io/GITHUB_REPOSITORY/ai-worker:latest + imagePullPolicy: Always + ports: + - containerPort: 9090 + name: metrics + protocol: TCP + envFrom: + - configMapRef: + name: ai-platform-config + - secretRef: + name: ai-platform-secrets + optional: true + volumeMounts: + - name: tmp + mountPath: /tmp + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + readinessProbe: + exec: + command: ["pgrep", "-x", "dotnet"] + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + exec: + command: ["pgrep", "-x", "dotnet"] + initialDelaySeconds: 20 + periodSeconds: 30 + failureThreshold: 3 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + volumes: + - name: tmp + emptyDir: {} diff --git a/DevOps-Project-41/k8s/base/worker-service.yaml b/DevOps-Project-41/k8s/base/worker-service.yaml new file mode 100644 index 00000000..a2a24d8e --- /dev/null +++ b/DevOps-Project-41/k8s/base/worker-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: ai-worker + namespace: ai-devsecops + labels: + app.kubernetes.io/name: ai-worker + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/component: worker +spec: + selector: + app.kubernetes.io/name: ai-worker + ports: + - name: metrics + port: 9090 + targetPort: 9090 + protocol: TCP + type: ClusterIP diff --git a/DevOps-Project-41/k8s/overlays/dev/api-deployment-patch.yaml b/DevOps-Project-41/k8s/overlays/dev/api-deployment-patch.yaml new file mode 100644 index 00000000..9e2d3f4e --- /dev/null +++ b/DevOps-Project-41/k8s/overlays/dev/api-deployment-patch.yaml @@ -0,0 +1,18 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-api + namespace: 
ai-devsecops +spec: + replicas: 1 + template: + spec: + containers: + - name: ai-api + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" diff --git a/DevOps-Project-41/k8s/overlays/dev/kustomization.yaml b/DevOps-Project-41/k8s/overlays/dev/kustomization.yaml new file mode 100644 index 00000000..359ec469 --- /dev/null +++ b/DevOps-Project-41/k8s/overlays/dev/kustomization.yaml @@ -0,0 +1,38 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: ai-devsecops + +resources: + - ../../base + +patches: + - path: api-deployment-patch.yaml + - path: worker-deployment-patch.yaml + +configMapGenerator: + - name: ai-platform-config + behavior: merge + literals: + - AI_PROVIDER=mock + - ASPNETCORE_ENVIRONMENT=Development + options: + disableNameSuffixHash: true + +secretGenerator: + - name: postgres-secret + literals: + - password=aiops-dev-password + options: + disableNameSuffixHash: true + - name: ai-platform-secrets + literals: + - POSTGRES_CONNECTION_STRING=Host=postgres;Database=aiops;Username=aiops;Password=aiops-dev-password + options: + disableNameSuffixHash: true + +images: + - name: ghcr.io/GITHUB_REPOSITORY/ai-api + newTag: dev + - name: ghcr.io/GITHUB_REPOSITORY/ai-worker + newTag: dev diff --git a/DevOps-Project-41/k8s/overlays/dev/worker-deployment-patch.yaml b/DevOps-Project-41/k8s/overlays/dev/worker-deployment-patch.yaml new file mode 100644 index 00000000..ed1c1aa3 --- /dev/null +++ b/DevOps-Project-41/k8s/overlays/dev/worker-deployment-patch.yaml @@ -0,0 +1,18 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-worker + namespace: ai-devsecops +spec: + replicas: 1 + template: + spec: + containers: + - name: ai-worker + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" diff --git a/DevOps-Project-41/k8s/overlays/prod/api-deployment-patch.yaml b/DevOps-Project-41/k8s/overlays/prod/api-deployment-patch.yaml new file mode 100644 
index 00000000..48cde19b --- /dev/null +++ b/DevOps-Project-41/k8s/overlays/prod/api-deployment-patch.yaml @@ -0,0 +1,33 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-api + namespace: ai-devsecops +spec: + replicas: 2 + template: + spec: + containers: + - name: ai-api + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "512Mi" + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 diff --git a/DevOps-Project-41/k8s/overlays/prod/kustomization.yaml b/DevOps-Project-41/k8s/overlays/prod/kustomization.yaml new file mode 100644 index 00000000..336ff3ca --- /dev/null +++ b/DevOps-Project-41/k8s/overlays/prod/kustomization.yaml @@ -0,0 +1,34 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: ai-devsecops + +resources: + - ../../base + +patches: + - path: api-deployment-patch.yaml + - path: worker-deployment-patch.yaml + +configMapGenerator: + - name: ai-platform-config + behavior: merge + literals: + - ASPNETCORE_ENVIRONMENT=Production + options: + disableNameSuffixHash: true + +# Production secrets must be created out-of-band before applying this overlay. +# Use External Secrets Operator or Sealed Secrets — never commit plain secrets to Git. 
+# +# kubectl -n ai-devsecops create secret generic postgres-secret \ +# --from-literal=password=STRONG_PASSWORD +# +# kubectl -n ai-devsecops create secret generic ai-platform-secrets \ +# --from-literal=POSTGRES_CONNECTION_STRING='Host=postgres;Database=aiops;Username=aiops;Password=STRONG_PASSWORD' + +images: + - name: ghcr.io/GITHUB_REPOSITORY/ai-api + newTag: "1.0.0" + - name: ghcr.io/GITHUB_REPOSITORY/ai-worker + newTag: "1.0.0" diff --git a/DevOps-Project-41/k8s/overlays/prod/worker-deployment-patch.yaml b/DevOps-Project-41/k8s/overlays/prod/worker-deployment-patch.yaml new file mode 100644 index 00000000..9aff518c --- /dev/null +++ b/DevOps-Project-41/k8s/overlays/prod/worker-deployment-patch.yaml @@ -0,0 +1,18 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-worker + namespace: ai-devsecops +spec: + replicas: 1 + template: + spec: + containers: + - name: ai-worker + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "512Mi" diff --git a/DevOps-Project-41/observability/grafana-dashboard.json b/DevOps-Project-41/observability/grafana-dashboard.json new file mode 100644 index 00000000..f7aa681a --- /dev/null +++ b/DevOps-Project-41/observability/grafana-dashboard.json @@ -0,0 +1,147 @@ +{ + "__inputs": [], + "__requires": [], + "annotations": { "list": [] }, + "description": "AI-Native DevSecOps Platform — operational dashboard", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "id": 1, + "title": "API Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "targets": [ + { + "expr": "rate(http_requests_received_total{job=\"ai-api\"}[1m])", + "legendFormat": "{{method}} {{path}}", + "refId": "A" + } + ] + }, + { + "id": 2, + "title": "API P95 Latency (ms)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, 
rate(http_request_duration_seconds_bucket{job=\"ai-api\"}[5m])) * 1000", + "legendFormat": "p95 latency", + "refId": "A" + } + ] + }, + { + "id": 3, + "title": "Job Queue Depth", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 8 }, + "targets": [ + { + "expr": "ai_queue_depth", + "legendFormat": "Queue depth", + "refId": "A" + } + ], + "options": { "colorMode": "background", "graphMode": "area" } + }, + { + "id": 4, + "title": "Worker Replicas", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 8 }, + "targets": [ + { + "expr": "kube_deployment_status_replicas_ready{namespace=\"ai-devsecops\", deployment=\"ai-worker\"}", + "legendFormat": "Ready replicas", + "refId": "A" + } + ] + }, + { + "id": 5, + "title": "Jobs Created", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 8 }, + "targets": [ + { + "expr": "increase(ai_jobs_created_total[1h])", + "legendFormat": "Jobs (1h)", + "refId": "A" + } + ] + }, + { + "id": 6, + "title": "Job Success/Failure Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "targets": [ + { + "expr": "rate(ai_jobs_completed_total[1m])", + "legendFormat": "Completed", + "refId": "A" + }, + { + "expr": "rate(ai_jobs_failed_total[1m])", + "legendFormat": "Failed", + "refId": "B" + } + ] + }, + { + "id": 7, + "title": "AI Provider Duration (p95 ms)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(ai_job_duration_seconds_bucket[5m])) * 1000", + "legendFormat": "p95 duration", + "refId": "A" + } + ] + }, + { + "id": 8, + "title": "Redis Availability", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 20 }, + "targets": [ + { + "expr": "up{job=\"redis\"}", + "legendFormat": "Redis", + "refId": "A" + } + ], + "options": { "colorMode": "background" } + }, + { + "id": 9, + "title": "PostgreSQL Availability", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 20 }, + 
"targets": [ + { + "expr": "up{job=\"postgres\"}", + "legendFormat": "PostgreSQL", + "refId": "A" + } + ], + "options": { "colorMode": "background" } + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["ai", "devsecops", "platform"], + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "AI-Native DevSecOps Platform", + "uid": "ai-devsecops-platform", + "version": 1 +} diff --git a/DevOps-Project-41/observability/loki-values.yaml b/DevOps-Project-41/observability/loki-values.yaml new file mode 100644 index 00000000..b41b70b0 --- /dev/null +++ b/DevOps-Project-41/observability/loki-values.yaml @@ -0,0 +1,31 @@ +loki: + commonConfig: + replication_factor: 1 + storage: + type: filesystem + auth_enabled: false + +singleBinary: + replicas: 1 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + +promtail: + enabled: true + config: + clients: + - url: http://loki:3100/loki/api/v1/push + snippets: + pipelineStages: + - json: + expressions: + traceId: traceId + level: level + - labels: + traceId: + level: diff --git a/DevOps-Project-41/observability/otel-collector-local.yaml b/DevOps-Project-41/observability/otel-collector-local.yaml new file mode 100644 index 00000000..6ab9e075 --- /dev/null +++ b/DevOps-Project-41/observability/otel-collector-local.yaml @@ -0,0 +1,33 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 5s + +exporters: + prometheus: + endpoint: "0.0.0.0:8889" + namespace: ai_platform + debug: + verbosity: detailed + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus] + logs: + receivers: [otlp] + processors: [batch] + exporters: [debug] diff --git a/DevOps-Project-41/observability/otel-collector.yaml b/DevOps-Project-41/observability/otel-collector.yaml 
new file mode 100644 index 00000000..0527e343 --- /dev/null +++ b/DevOps-Project-41/observability/otel-collector.yaml @@ -0,0 +1,128 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: ai-devsecops + labels: + app.kubernetes.io/name: otel-collector + app.kubernetes.io/part-of: ai-native-devsecops-platform +data: + otel-collector-config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + timeout: 5s + send_batch_size: 1024 + memory_limiter: + limit_mib: 256 # NOTE(review): same value as the 256Mi container memory limit; confirm there is headroom before the OOM killer + spike_limit_mib: 64 + check_interval: 5s + + exporters: + prometheus: + endpoint: "0.0.0.0:8889" + namespace: ai_platform + debug: + verbosity: normal + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [debug, otlp/tempo] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [prometheus] + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [debug] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: ai-devsecops + labels: + app.kubernetes.io/name: otel-collector + app.kubernetes.io/version: "0.104.0" + app.kubernetes.io/part-of: ai-native-devsecops-platform + app.kubernetes.io/component: observability +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: otel-collector + template: + metadata: + labels: + app.kubernetes.io/name: otel-collector + app.kubernetes.io/component: observability + spec: + securityContext: + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.104.0 + args: ["--config=/etc/otel/otel-collector-config.yaml"] # args, not command: command would replace the image entrypoint and try to exec the flag string + ports: + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + - containerPort: 8889 + name: prometheus + volumeMounts: + - name:
config + mountPath: /etc/otel + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "256Mi" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ALL] + volumes: + - name: config + configMap: + name: otel-collector-config +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: ai-devsecops + labels: + app.kubernetes.io/name: otel-collector + app.kubernetes.io/part-of: ai-native-devsecops-platform +spec: + selector: + app.kubernetes.io/name: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + - name: otlp-http + port: 4318 + targetPort: 4318 + - name: prometheus + port: 8889 + targetPort: 8889 diff --git a/DevOps-Project-41/observability/prometheus-local.yml b/DevOps-Project-41/observability/prometheus-local.yml new file mode 100644 index 00000000..c1acd8c8 --- /dev/null +++ b/DevOps-Project-41/observability/prometheus-local.yml @@ -0,0 +1,22 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ["localhost:9090"] + + - job_name: ai-api + static_configs: + - targets: ["ai-api:8080"] + metrics_path: /metrics + + - job_name: ai-worker + static_configs: + - targets: ["ai-worker:9090"] + metrics_path: /metrics + + - job_name: otel-collector + static_configs: + - targets: ["otel-collector:8889"] diff --git a/DevOps-Project-41/observability/prometheus-values.yaml b/DevOps-Project-41/observability/prometheus-values.yaml new file mode 100644 index 00000000..8c24b253 --- /dev/null +++ b/DevOps-Project-41/observability/prometheus-values.yaml @@ -0,0 +1,40 @@ +prometheus: + prometheusSpec: + retention: 7d + scrapeInterval: 15s + additionalScrapeConfigs: + - job_name: ai-platform + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - ai-devsecops + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: "true" + - 
source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + +grafana: + enabled: true + adminPassword: admin + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: ai-platform + orgId: 1 + folder: AI Platform + type: file + options: + path: /var/lib/grafana/dashboards/ai-platform + dashboardsConfigMaps: + ai-platform: "grafana-dashboards" diff --git a/DevOps-Project-41/security/cosign.md b/DevOps-Project-41/security/cosign.md new file mode 100644 index 00000000..6b08a1d3 --- /dev/null +++ b/DevOps-Project-41/security/cosign.md @@ -0,0 +1,40 @@ +# Cosign — Image Signing and Verification + +This project uses **Cosign keyless signing** via GitHub Actions OIDC. No long-lived signing keys are stored. + +## How signing works + +1. The `release.yml` pipeline authenticates to Sigstore via GitHub OIDC (`id-token: write` permission). +2. Cosign signs the image digest after push to GHCR. +3. The signature and transparency log entry are stored alongside the image in GHCR. 
+ +## Verify a signed image locally + +```bash +# Install cosign +brew install cosign # macOS +# or: curl -O -L https://github.com/sigstore/cosign/releases/latest/download/cosign-linux-amd64 && chmod +x cosign-linux-amd64 + +# Verify the api image +cosign verify \ + ghcr.io/GITHUB_OWNER/ai-api:1.0.0 \ + --certificate-identity-regexp "https://github.com/GITHUB_OWNER/DevOps-Projects/.github/workflows/project-41-release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com + +# Verify the worker image +cosign verify \ + ghcr.io/GITHUB_OWNER/ai-worker:1.0.0 \ + --certificate-identity-regexp "https://github.com/GITHUB_OWNER/DevOps-Projects/.github/workflows/project-41-release.yml.*" \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com +``` + +## What the output means + +A successful verification prints the certificate chain and the Rekor transparency log entry. If verification fails, the image has either not been signed or the signature does not match the expected workflow identity — do not deploy it. + +## Notes + +- Keyless signing requires no secret management. +- The OIDC identity ties the signature to this specific GitHub Actions workflow. +- The Rekor transparency log provides an immutable audit trail. +- Replace `GITHUB_OWNER` with your GitHub username or organisation. 
diff --git a/DevOps-Project-41/security/policies/deny-privileged-containers.yaml b/DevOps-Project-41/security/policies/deny-privileged-containers.yaml
new file mode 100644
index 00000000..ea11de7b
--- /dev/null
+++ b/DevOps-Project-41/security/policies/deny-privileged-containers.yaml
@@ -0,0 +1,35 @@
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: deny-privileged-containers
+  annotations:
+    policies.kyverno.io/title: Deny Privileged Containers
+    policies.kyverno.io/category: Pod Security Standards
+    policies.kyverno.io/severity: high
+    policies.kyverno.io/description: >-
+      Privileged containers allow root-level access to the host and must be denied.
+spec:
+  validationFailureAction: Enforce
+  background: true
+  rules:
+    - name: deny-privileged
+      match:
+        any:
+          - resources:
+              kinds: [Pod]
+              namespaces: [ai-devsecops]
+      validate:
+        message: "Privileged containers are not allowed."
+        # Init and ephemeral containers run in the same Pod and can be
+        # privileged too, so they are validated whenever they are present.
+        pattern:
+          spec:
+            =(ephemeralContainers):
+              - =(securityContext):
+                  =(privileged): "false"
+            =(initContainers):
+              - =(securityContext):
+                  =(privileged): "false"
+            containers:
+              - =(securityContext):
+                  =(privileged): "false"
diff --git a/DevOps-Project-41/security/policies/require-labels.yaml b/DevOps-Project-41/security/policies/require-labels.yaml
new file mode 100644
index 00000000..8b88554c
--- /dev/null
+++ b/DevOps-Project-41/security/policies/require-labels.yaml
@@ -0,0 +1,27 @@
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: require-app-labels
+  annotations:
+    policies.kyverno.io/title: Require Application Labels
+    policies.kyverno.io/category: Best Practices
+    policies.kyverno.io/severity: low
+    policies.kyverno.io/description: >-
+      All workloads must define app.kubernetes.io/name and app.kubernetes.io/version labels.
+spec:
+  validationFailureAction: Enforce
+  background: true
+  rules:
+    - name: require-labels
+      match:
+        any:
+          - resources:
+              kinds: [Deployment, StatefulSet, DaemonSet]
+              namespaces: [ai-devsecops]
+      validate:
+        message: "Labels app.kubernetes.io/name and app.kubernetes.io/version are required."
+        pattern:
+          metadata:
+            labels:
+              app.kubernetes.io/name: "?*"
+              app.kubernetes.io/version: "?*"
diff --git a/DevOps-Project-41/security/policies/require-non-root.yaml b/DevOps-Project-41/security/policies/require-non-root.yaml
new file mode 100644
index 00000000..7754dcac
--- /dev/null
+++ b/DevOps-Project-41/security/policies/require-non-root.yaml
@@ -0,0 +1,30 @@
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: require-run-as-non-root
+  annotations:
+    policies.kyverno.io/title: Require Run As Non Root
+    policies.kyverno.io/category: Pod Security Standards
+    policies.kyverno.io/severity: high
+    policies.kyverno.io/description: >-
+      All containers must set runAsNonRoot=true to prevent privilege escalation.
+spec:
+  validationFailureAction: Enforce
+  background: true
+  rules:
+    - name: require-non-root-pod
+      match:
+        any:
+          - resources:
+              kinds: [Pod]
+              namespaces: [ai-devsecops]
+      validate:
+        message: "Containers must run as non-root. Set securityContext.runAsNonRoot=true."
+        anyPattern:
+          - spec:
+              securityContext:
+                runAsNonRoot: true
+          - spec:
+              containers:
+                - securityContext:
+                    runAsNonRoot: true
diff --git a/DevOps-Project-41/security/policies/require-resource-limits.yaml b/DevOps-Project-41/security/policies/require-resource-limits.yaml
new file mode 100644
index 00000000..2a3f70c5
--- /dev/null
+++ b/DevOps-Project-41/security/policies/require-resource-limits.yaml
@@ -0,0 +1,32 @@
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: require-resource-limits
+  annotations:
+    policies.kyverno.io/title: Require Resource Limits
+    policies.kyverno.io/category: Best Practices
+    policies.kyverno.io/severity: medium
+    policies.kyverno.io/description: >-
+      All containers must define resource requests and limits to ensure fair scheduling.
+spec:
+  validationFailureAction: Enforce
+  background: true
+  rules:
+    - name: require-limits
+      match:
+        any:
+          - resources:
+              kinds: [Pod]
+              namespaces: [ai-devsecops]
+      validate:
+        message: "All containers must specify resource requests and limits."
+        pattern:
+          spec:
+            containers:
+              - resources:
+                  requests:
+                    memory: "?*"
+                    cpu: "?*"
+                  limits:
+                    memory: "?*"
+                    cpu: "?*"
diff --git a/DevOps-Project-41/security/policies/restrict-latest-tag.yaml b/DevOps-Project-41/security/policies/restrict-latest-tag.yaml
new file mode 100644
index 00000000..95359a67
--- /dev/null
+++ b/DevOps-Project-41/security/policies/restrict-latest-tag.yaml
@@ -0,0 +1,41 @@
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: restrict-latest-tag
+  annotations:
+    policies.kyverno.io/title: Restrict Latest Tag
+    policies.kyverno.io/category: Best Practices
+    policies.kyverno.io/severity: medium
+    policies.kyverno.io/description: >-
+      Using 'latest' image tag makes deployments non-reproducible. Require explicit version tags.
+spec:
+  validationFailureAction: Enforce
+  background: true
+  rules:
+    # An untagged image (e.g. "nginx") resolves to :latest at pull time, so a
+    # tag or digest must be present before the :latest check means anything.
+    - name: require-image-tag
+      match:
+        any:
+          - resources:
+              kinds: [Pod]
+              namespaces: [ai-devsecops]
+      validate:
+        message: "An explicit image tag or digest is required."
+        pattern:
+          spec:
+            containers:
+              - image: "*:*"
+    # Reject any image reference that ends in the mutable :latest tag.
+    - name: no-latest-tag
+      match:
+        any:
+          - resources:
+              kinds: [Pod]
+              namespaces: [ai-devsecops]
+      validate:
+        message: "Image tag 'latest' is not allowed. Use an explicit version tag (e.g., :1.0.0 or a SHA digest)."
+        pattern:
+          spec:
+            containers:
+              - image: "!*:latest"
diff --git a/DevOps-Project-41/security/sbom.md b/DevOps-Project-41/security/sbom.md
new file mode 100644
index 00000000..f0f4ea82
--- /dev/null
+++ b/DevOps-Project-41/security/sbom.md
@@ -0,0 +1,52 @@
+# SBOM — Software Bill of Materials
+
+This project generates SBOMs in both **SPDX** and **CycloneDX** formats as part of the release pipeline.
+
+## What is an SBOM?
+
+An SBOM is a machine-readable inventory of all software components, dependencies, and licences in a build artefact. It enables vulnerability tracking, licence compliance, and supply chain auditing.
+
+## Generated artefacts
+
+| Format | File | Description |
+|--------|------|-------------|
+| SPDX JSON | `sbom-api.spdx.json` | SPDX 2.3 SBOM for the AI API image |
+| CycloneDX JSON | `sbom-api.cyclonedx.json` | CycloneDX 1.4 SBOM for the AI API image |
+| SPDX JSON | `sbom-worker.spdx.json` | SPDX 2.3 SBOM for the AI Worker image |
+
+Artefacts are attached to the GitHub Actions release run and also attested to the image in GHCR using `cosign attest`.
+
+## Generate SBOM locally
+
+```bash
+# Using Trivy (SPDX)
+trivy image --format spdx-json --output sbom-api.spdx.json \
+  ghcr.io/GITHUB_OWNER/ai-api:1.0.0
+
+# Using Trivy (CycloneDX)
+trivy image --format cyclonedx --output sbom-api.cyclonedx.json \
+  ghcr.io/GITHUB_OWNER/ai-api:1.0.0
+
+# Using Syft (alternative)
+syft ghcr.io/GITHUB_OWNER/ai-api:1.0.0 \
+  -o spdx-json=sbom-api.spdx.json \
+  -o cyclonedx-json=sbom-api.cyclonedx.json
+```
+
+## Inspect an SBOM
+
+```bash
+# List all packages
+cat sbom-api.spdx.json | jq '.packages[].name' | sort -u
+
+# Find a specific package
+cat sbom-api.cyclonedx.json | jq '.components[] | select(.name == "StackExchange.Redis")'
+```
+
+## Attest SBOM to image (done by pipeline)
+
+```bash
+cosign attest --predicate sbom-api.spdx.json \
+  --type spdxjson \
+  ghcr.io/GITHUB_OWNER/ai-api:1.0.0
+```
diff --git a/DevOps-Project-41/security/trivy.yaml b/DevOps-Project-41/security/trivy.yaml
new file mode 100644
index 00000000..27a57e3f
--- /dev/null
+++ b/DevOps-Project-41/security/trivy.yaml
@@ -0,0 +1,28 @@
+# Trivy scan configuration.
+# Only CRITICAL and HIGH findings are reported, and any such finding makes
+severity:
+  - CRITICAL
+  - HIGH
+
+exit-code: 1
+
+# Scanner selection belongs under the `scan` section of Trivy's config-file
+# schema; a top-level `scanners` key is not part of the documented schema.
+scan:
+  scanners:
+    - vuln
+    - secret
+    - misconfig
+
+# Section names are the long forms, not the scanner short names.
+vulnerability:
+  ignore-unfixed: false
+
+misconfiguration:
+  include-non-failures: false
+
+format: table
+
+cache:
+  dir: .trivy-cache
+  backend: fs
diff --git a/DevOps-Project-41/tests/load/k6-ai-jobs.js b/DevOps-Project-41/tests/load/k6-ai-jobs.js
new file mode 100644
index 00000000..dff9b7b0
--- /dev/null
+++ b/DevOps-Project-41/tests/load/k6-ai-jobs.js
@@ -0,0 +1,110 @@
+import http from "k6/http";
+import { check, sleep } from "k6";
+import { Counter, Rate, Trend } from "k6/metrics";
+
+// Custom metrics reported in addition to k6's built-in HTTP metrics.
+const jobsCreated = new Counter("ai_jobs_created");
+const jobsCompleted = new Counter("ai_jobs_completed");
+const jobsFailed = new Counter("ai_jobs_failed");
+const successRate = new Rate("ai_success_rate");
+const jobDuration = new Trend("ai_job_duration_ms", true);
+
+export const options = {
+  stages: [
+    { duration: "30s", target: 5 }, // ramp up
+    { duration: "1m", target: 10 }, // sustained load — enough to trigger KEDA
+    { duration: "30s", target: 20 }, // spike
+    { duration: "30s", target: 0 }, // ramp down
+  ],
+  thresholds: {
+    http_req_failed: ["rate<0.05"],
+    http_req_duration: ["p(95)<2000"],
+    ai_success_rate: ["rate>0.90"],
+  },
+};
+
+const BASE_URL = __ENV.API_URL || "http://localhost:8080";
+const MAX_POLL_ATTEMPTS = 10;
+
+const PROMPTS = [
+  "Explain GitOps in simple terms",
+  "What is Kubernetes?",
+  "How does KEDA work?",
+  "What is DevSecOps?",
+  "Describe OpenTelemetry",
+  "What is a container image?",
+  "Explain CI/CD pipelines",
+  "What is a service mesh?",
+];
+
+function randomPrompt() {
+  return PROMPTS[Math.floor(Math.random() * PROMPTS.length)];
+}
+
+// Reads one field from a JSON response body. Returns undefined instead of
+// throwing when the body is not valid JSON (e.g. an HTML error page).
+function jsonField(res, selector) {
+  try {
+    return res.json(selector);
+  } catch (_) {
+    return undefined;
+  }
+}
+
+// One iteration: submit a prompt, then poll the job until it finishes.
+export default function () {
+  const askPayload = JSON.stringify({
+    prompt: randomPrompt(),
+    model: "mock-devops-model",
+  });
+
+  const askRes = http.post(`${BASE_URL}/ask`, askPayload, {
+    headers: { "Content-Type": "application/json" },
+  });
+
+  const askOk = check(askRes, {
+    "POST /ask status 202": (r) => r.status === 202,
+    "POST /ask has jobId": (r) => jsonField(r, "jobId") != null,
+  });
+
+  if (!askOk) {
+    jobsFailed.add(1);
+    successRate.add(false);
+    return;
+  }
+
+  jobsCreated.add(1);
+  const jobId = jsonField(askRes, "jobId");
+  const start = Date.now();
+
+  // Poll once per second until the job is terminal or attempts run out.
+  let finalStatus = "timeout";
+  for (let i = 0; i < MAX_POLL_ATTEMPTS; i++) {
+    sleep(1);
+    const jobRes = http.get(`${BASE_URL}/jobs/${jobId}`);
+    const status = jsonField(jobRes, "status");
+
+    if (status === "completed") {
+      finalStatus = status;
+      jobsCompleted.add(1);
+      jobDuration.add(Date.now() - start);
+
+      check(jobRes, {
+        "completed job has result": (r) => jsonField(r, "result") != null,
+      });
+      break;
+    }
+
+    if (status === "failed") {
+      finalStatus = status;
+      jobsFailed.add(1);
+      break;
+    }
+  }
+
+  // Record exactly one success-rate sample per job; the previous version
+  // added a second `false` sample after the loop for jobs that had failed.
+  successRate.add(finalStatus === "completed");
+
+  sleep(0.5);
+}
diff --git a/DevOps-Project-41/tests/smoke/smoke-test.sh b/DevOps-Project-41/tests/smoke/smoke-test.sh
new file mode 100755
index 00000000..50649520
--- /dev/null
+++ b/DevOps-Project-41/tests/smoke/smoke-test.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+# Smoke test for the AI API: checks /health, /ready, POST /ask and
+# GET /jobs/{jobId}, then exits non-zero if any assertion failed.
+#
+# Environment:
+#   API_URL   Base URL of the API (default: http://localhost:8080)
+#   MAX_WAIT  Seconds to wait for the job to finish (default: 30)
+set -euo pipefail
+
+API_URL="${API_URL:-http://localhost:8080}"
+MAX_WAIT="${MAX_WAIT:-30}"
+PASS=0
+FAIL=0
+
+green() { printf "\e[32m[PASS]\e[0m %s\n" "$*"; }
+red()   { printf "\e[31m[FAIL]\e[0m %s\n" "$*"; }
+info()  { printf "\e[34m[INFO]\e[0m %s\n" "$*"; }
+
+# Counters use plain assignment: under `set -e`, `((PASS++))` aborts the
+# script on its first use because the expression evaluates to 0.
+pass() { green "$1"; PASS=$((PASS + 1)); }
+fail() { red "$1"; FAIL=$((FAIL + 1)); }
+
+# Prints the HTTP status code of a request. A curl transport error prints
+# "000" rather than terminating the script.
+http_status() {
+  curl -s -o /dev/null -w "%{http_code}" "$@" || true
+}
+
+assert_eq() {
+  local label="$1" expected="$2" actual="$3"
+  if [[ "$actual" == "$expected" ]]; then
+    pass "$label"
+  else
+    fail "$label — expected '$expected', got '$actual'"
+  fi
+}
+
+assert_contains() {
+  local label="$1" needle="$2" haystack="$3"
+  # Fixed-string match fed by a here-string: the needle is not treated as a
+  # regex and there is no pipeline for `pipefail` to trip over.
+  if grep -qF -- "$needle" <<<"$haystack"; then
+    pass "$label"
+  else
+    fail "$label — expected to contain '$needle'"
+  fi
+}
+
+info "Smoke test against: $API_URL"
+echo ""
+
+# ── /health ────────────────────────────────────────────────────────────────────
+info "Testing /health"
+health_status=$(http_status "$API_URL/health")
+assert_eq "GET /health returns 200" "200" "$health_status"
+
+# ── /ready ─────────────────────────────────────────────────────────────────────
+info "Testing /ready"
+ready_status=$(http_status "$API_URL/ready")
+if [[ "$ready_status" == "200" || "$ready_status" == "503" ]]; then
+  pass "GET /ready returns 200 or 503 (degraded acceptable)"
+else
+  fail "GET /ready returned unexpected status: $ready_status"
+fi
+
+# ── POST /ask ──────────────────────────────────────────────────────────────────
+info "Testing POST /ask"
+# A single request yields both body and status: curl appends the status code
+# on its own final line, which is split off below.
+ask_raw=$(curl -s -w '\n%{http_code}' -X POST "$API_URL/ask" \
+  -H "Content-Type: application/json" \
+  -d '{"prompt":"Explain GitOps in simple terms","model":"mock-devops-model"}' || true)
+ask_status="${ask_raw##*$'\n'}"
+ask_response="${ask_raw%$'\n'*}"
+
+assert_eq "POST /ask returns 202" "202" "$ask_status"
+assert_contains "POST /ask returns jobId" "jobId" "$ask_response"
+assert_contains "POST /ask returns status=queued" "queued" "$ask_response"
+
+JOB_ID=$(grep -o '"jobId":"[^"]*"' <<<"$ask_response" | cut -d'"' -f4 || true)
+
+# ── GET /jobs/{jobId} — poll until completed ───────────────────────────────────
+if [[ -z "$JOB_ID" ]]; then
+  fail "No jobId in POST /ask response — cannot poll job status"
+else
+  info "Job created: $JOB_ID"
+  info "Polling /jobs/$JOB_ID (max ${MAX_WAIT}s)"
+  elapsed=0
+  job_status="queued"
+  job_response=""
+  while [[ "$job_status" != "completed" && "$job_status" != "failed" && $elapsed -lt $MAX_WAIT ]]; do
+    sleep 2
+    elapsed=$((elapsed + 2))
+    job_response=$(curl -s "$API_URL/jobs/$JOB_ID" || true)
+    job_status=$(grep -o '"status":"[^"]*"' <<<"$job_response" | cut -d'"' -f4 || true)
+    info "  status=$job_status (${elapsed}s elapsed)"
+  done
+
+  assert_eq "Job completes with status=completed" "completed" "$job_status"
+  assert_contains "Completed job has a result" "result" "$job_response"
+fi
+
+# ── GET /jobs/{unknown} ────────────────────────────────────────────────────────
+info "Testing 404 for unknown job"
+not_found_status=$(http_status "$API_URL/jobs/nonexistent-job-id")
+assert_eq "GET /jobs/nonexistent returns 404" "404" "$not_found_status"
+
+# ── POST /ask with empty prompt ────────────────────────────────────────────────
+info "Testing validation"
+bad_req_status=$(http_status -X POST "$API_URL/ask" \
+  -H "Content-Type: application/json" \
+  -d '{"prompt":"","model":"mock-devops-model"}')
+assert_eq "POST /ask with empty prompt returns 400" "400" "$bad_req_status"
+
+# ── Summary ────────────────────────────────────────────────────────────────────
+echo ""
+echo "────────────────────────────────────────"
+info "Results: PASS=$PASS FAIL=$FAIL"
+echo "────────────────────────────────────────"
+
+if [[ $FAIL -gt 0 ]]; then
+  red "Smoke test FAILED"
+  exit 1
+else
+  green "All smoke tests PASSED"
+  exit 0
+fi
diff --git a/README.md b/README.md
index c8a93801..0bb0e100 100644
--- a/README.md
+++ b/README.md
@@ -171,6 +171,7 @@ For comprehensive AWS-specific projects and hands-on learning experiences, visit
 | **38** | Automated Testing in CI/CD Pipeline | Selenium, JUnit, Jest, Jenkins, Docker, Kubernetes | Integrate automated testing (unit, integration, E2E) in CI/CD |
 | **39** | Service Mesh Implementation with Istio | Istio, Kubernetes, Prometheus, Grafana, Kiali, Jaeger | Implement service mesh for microservices with observability |
 | **40** | Cloud Migration Strategy and Execution | AWS Migration Hub, Database Migration Service, Terraform, Jenkins | Plan and execute cloud migration from on-premises to AWS |
+| **41** | AI-Native DevSecOps Platform with GitOps and Observability | GitHub Actions, Docker, Kubernetes, Argo CD, KEDA, OpenTelemetry, Prometheus, Grafana, Trivy, Cosign, Terraform | End-to-end platform for deploying AI-ready applications with GitOps, autoscaling, observability, SBOM generation, image signing and DevSecOps controls |
 
 ![Projects Overview](https://img.shields.io/badge/📊%20Projects%20Overview-40%20Complete%20DevOps%20Projects-orange?style=for-the-badge&logo=github&logoColor=white)
 