---
# OpenAPI 3.1 description of the HTTP management ("cold path") API.
openapi: 3.1.0

# Document-level metadata displayed by documentation renderers.
info:
  title: Jope Inference Server · Management API
  # Quoted so the value is always read as a string.
  version: '0.1.0'
  summary: REST management channel for model lifecycle and training jobs.
  description: |
    Cold-path management interface of the **Jope Inference Server** — the stateless
    Python process that serves Raman-spectrum-to-concentration predictions for the
    Jope.SMB Operator Console.

    * **Hot path** (per-scan prediction) is served over a separate **ZMQ REQ-REP**
      socket on port `5555`. See the AsyncAPI specification for that channel.
    * **Cold path** (this API) handles hot-swap model loading, training-job
      lifecycle, and health probes over plain HTTP/1.1.

    All endpoints return JSON. Error responses carry a machine-readable body
    with `code` and `message` fields (see the `ErrorResponse` schema).
  contact:
    name: Jope Technology Co., Ltd.
    url: https://jope-docs.pages.dev
  license:
    name: Proprietary — Jope internal use only
  termsOfService: https://jope-docs.pages.dev/

# Base URLs against which the paths below are resolved.
servers:
  # Templated URL: `{inferenceHost}` is filled in from the variable below.
  - description: Plant-LAN Inference Host (production)
    url: 'http://{inferenceHost}:5556'
    variables:
      inferenceHost:
        description: IP or DNS name of the dedicated Inference Host on the plant LAN.
        # Quoted so the dotted address is always a string.
        default: '10.0.1.42'
  - description: Try-It mock server (used by the API Explorer)
    url: 'https://jope-docs.pages.dev/mock-api'

# Tag catalogue used to group the operations.
tags:
  - name: Models
    description: >-
      Model registry — list available models, hot-swap the active one, inspect metadata.
  - name: Training
    description: >-
      Training jobs — start, poll progress, retrieve metrics on completion.
  - name: Health
    description: >-
      Liveness / readiness probes.

paths:
  # POST /model/load — switch the active model (operationId: loadModel).
  /model/load:
    post:
      operationId: loadModel
      tags:
        - Models
      summary: Hot-swap the active model
      description: |
        Replace the currently active model without stopping the server.
        After success, the next `predict` call over ZMQ will be served by the
        newly loaded model.

        **Atomicity guarantee** — the swap is atomic. Any in-flight `predict`
        finishes against the previous model; the next call uses the new one.

        **Audit** — the Operator Console records this event in the hash-chained
        audit trail with the signing operator's identity (21 CFR Part 11).
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/LoadModelRequest'
            # Two sample bodies: promote a staged model, or load from a file path.
            examples:
              staged:
                summary: Promote a staged model to active
                value:
                  version: v6.2
                  source: staged
              fromFile:
                summary: Load a specific registry file
                value:
                  version: v6.2
                  source: file
                  path: /var/lib/jope/models/v6.2.joblib
      responses:
        '200':
          description: Model loaded successfully
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/LoadModelResponse'
              example:
                loaded_version: v6.2
                previous_version: v5
                load_time_ms: 234
                algorithm: PLS+Ridge
        # Both error statuses share the ErrorResponse body.
        '400':
          description: Version not found or model file corrupted
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              example:
                code: MODEL_NOT_FOUND
                message: "Model version 'v9.9' does not exist in registry."
        '409':
          description: Another load request is already in progress
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              example:
                code: LOAD_IN_PROGRESS
                message: 'Previous /model/load still running. Try again in 1s.'

  # GET /model/list — registry inventory (operationId: listModels).
  /model/list:
    get:
      operationId: listModels
      tags:
        - Models
      summary: List all models in the registry
      description: |
        Returns every model file currently stored in the Inference Host registry,
        along with which one is active. Sorted by `trained_at` descending.
      responses:
        '200':
          description: List of models
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelList'
              # Sample payload; entries are listed newest `trained_at` first.
              example:
                active: v5
                models:
                  - version: v6.2
                    status: staged
                    trained_at: '2026-04-20T11:12:00Z'
                    trained_samples: 210
                    algorithm: PLS+Ridge
                  - version: v5
                    status: active
                    trained_at: '2026-03-15T10:00:00Z'
                    trained_samples: 187
                    algorithm: PLS+Ridge
                  - version: v4
                    status: archived
                    trained_at: '2026-02-08T09:00:00Z'
                    trained_samples: 160
                    algorithm: PLS

  # GET /model/info — metadata of the active model (operationId: getActiveModelInfo).
  /model/info:
    get:
      operationId: getActiveModelInfo
      tags:
        - Models
      summary: Get active model metadata
      description: >-
        Full metadata of the currently active model — algorithm, training metrics, wavenumber range.
      responses:
        '200':
          description: Active model metadata
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelInfo'
              example:
                active_version: v5
                algorithm: PLS+Ridge
                latent_components: 5
                trained_at: '2026-03-15T10:00:00Z'
                trained_samples: 187
                # Per-component metrics, keyed EPA / DHA / DPA.
                rmse:
                  EPA: 0.047
                  DHA: 0.052
                  DPA: 0.061
                r2:
                  EPA: 0.958
                  DHA: 0.942
                  DPA: 0.913
                # Two-element array, as constrained by the ModelInfo schema.
                wavenumber_range:
                  - 200.0
                  - 3200.0
                wavenumber_count: 2048

  # POST /training/start — upload a training set and start an async job
  # (operationId: startTraining).
  /training/start:
    post:
      tags: [Training]
      operationId: startTraining
      summary: Kick off an async training job
      description: |
        Uploads a CSV training set (Raman spectra + HPLC labels) and a JSON
        metadata blob, then asynchronously starts a training pipeline on the
        Inference Host.

        Returns `202 Accepted` with a `job_id` — poll `/training/{jobId}` for
        progress and final metrics.
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              required: [csv, meta]
              properties:
                # File part carrying the raw CSV bytes.
                csv:
                  type: string
                  format: binary
                  description: |
                    Training set CSV. Columns: `scan_id, wn_0, wn_1, ..., wn_2047,
                    EPA_g_per_L, DHA_g_per_L, DPA_g_per_L`.
                # JSON part. Referencing the component schema (instead of an
                # opaque string with a non-standard `format: json`) lets
                # validators and generated clients see the actual fields.
                meta:
                  $ref: '#/components/schemas/TrainingMeta'
                  description: JSON metadata, see `TrainingMeta` schema.
            # The `meta` part is sent with Content-Type application/json.
            encoding:
              meta:
                contentType: application/json
      responses:
        '202':
          description: Training job accepted
          content:
            application/json:
              schema: {$ref: '#/components/schemas/TrainingJobAccepted'}
              example:
                job_id: train-2026-04-22-001
                estimated_duration_seconds: 180
                status_url: /training/train-2026-04-22-001
        '400':
          description: Invalid CSV shape or metadata
          content:
            application/json:
              schema: {$ref: '#/components/schemas/ErrorResponse'}

  # GET /training/{jobId} — poll one training job (operationId: getTrainingJob).
  /training/{jobId}:
    get:
      operationId: getTrainingJob
      tags:
        - Training
      summary: Poll training job progress
      description: |
        Returns current status. On `done`, `result` contains the new model
        version and validation metrics.
      parameters:
        - name: jobId
          in: path
          required: true
          schema:
            type: string
          example: train-2026-04-22-001
      responses:
        '200':
          description: Training job status
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/TrainingJob'
              # One sample per outcome: running, done and failed.
              examples:
                running:
                  summary: In progress
                  value:
                    job_id: train-2026-04-22-001
                    status: running
                    progress: 0.47
                    elapsed_seconds: 84
                    result: null
                done:
                  summary: Completed successfully
                  value:
                    job_id: train-2026-04-22-001
                    status: done
                    progress: 1.0
                    elapsed_seconds: 172
                    result:
                      new_version: v6.3
                      metrics:
                        rmse:
                          EPA: 0.041
                          DHA: 0.048
                          DPA: 0.055
                        r2:
                          EPA: 0.963
                          DHA: 0.948
                          DPA: 0.921
                      artifact_path: /var/lib/jope/models/v6.3.joblib
                failed:
                  summary: Failed
                  value:
                    job_id: train-2026-04-22-001
                    status: failed
                    progress: 0.33
                    elapsed_seconds: 58
                    error:
                      code: TRAINING_DIVERGED
                      message: 'RMSE did not converge after 5 latent-component sweeps.'
                    result: null
        '404':
          description: Job id not found
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'

  # GET /health — liveness probe (operationId: getHealth).
  /health:
    get:
      operationId: getHealth
      tags:
        - Health
      summary: Liveness probe
      description: |
        Kubernetes / systemd-style health check. Returns `200` if the server
        is up. Use `model_loaded` to verify the server is ready to serve
        `predict` — a `200` with `model_loaded: false` means ZMQ requests
        will return `MODEL_NOT_LOADED` errors.
      responses:
        '200':
          description: Server is alive
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthStatus'
              example:
                status: ok
                model_loaded: true
                active_version: v5
                uptime_seconds: 3847
                # Dotted version strings are quoted so they stay strings.
                server_version: '0.3.1'
                protocol_version: 1
                python_version: '3.11.4'

components:
  schemas:

    # Request body of POST /model/load.
    LoadModelRequest:
      type: object
      required: [version]
      properties:
        version:
          type: string
          description: Target model version (must exist in registry).
          example: v6.2
        source:
          type: string
          enum: [staged, archived, file]
          default: staged
          description: |
            * `staged` — promote a staged model.
            * `archived` — re-activate an archived model.
            * `file` — load an arbitrary `.joblib` file (requires `path`).
        path:
          type: string
          description: Absolute path to a `.joblib` file (only when `source=file`).
          example: /var/lib/jope/models/v6.2.joblib
      # Express the documented rule "`file` requires `path`" in the schema itself.
      # `required: [source]` inside `if` keeps the rule from firing when `source`
      # is omitted (in which case the default `staged` applies).
      if:
        properties:
          source:
            const: file
        required: [source]
      then:
        required: [path]

    # 200 response body of POST /model/load.
    LoadModelResponse:
      type: object
      required:
        - loaded_version
        - previous_version
        - load_time_ms
      properties:
        loaded_version:
          type: string
          example: v6.2
        # NOTE(review): declared as a required string; confirm what the server
        # returns here when no model was active before the load.
        previous_version:
          type: string
          example: v5
        load_time_ms:
          description: End-to-end load time measured server-side.
          type: integer
          minimum: 0
          example: 234
        algorithm:
          type: string
          example: PLS+Ridge

    # One registry entry; used as the item type of ModelList.models.
    ModelVersion:
      type: object
      required:
        - version
        - status
        - trained_at
      properties:
        version:
          type: string
          example: v6.2
        status:
          type: string
          enum:
            - active
            - staged
            - archived
          example: staged
        trained_at:
          type: string
          format: date-time
          # Quoted so the timestamp is kept as a string.
          example: '2026-04-20T11:12:00Z'
        trained_samples:
          type: integer
          minimum: 0
          example: 210
        algorithm:
          type: string
          example: PLS+Ridge
        size_kb:
          type: integer
          minimum: 0
          example: 48

    # 200 response body of GET /model/list.
    ModelList:
      type: object
      required: [active, models]
      properties:
        active:
          # OpenAPI 3.1 has no `nullable` keyword; nullability is expressed
          # by adding 'null' to the JSON Schema type list.
          type: [string, 'null']
          description: Currently active version, `null` if none loaded.
          example: v5
        models:
          type: array
          items: {$ref: '#/components/schemas/ModelVersion'}

    # Shared shape for the `rmse` and `r2` metric objects.
    MetricBreakdown:
      description: Per-component metric (EPA / DHA / DPA).
      type: object
      properties:
        EPA:
          type: number
          format: float
          example: 0.047
        DHA:
          type: number
          format: float
          example: 0.052
        DPA:
          type: number
          format: float
          example: 0.061

    # 200 response body of GET /model/info.
    ModelInfo:
      type: object
      required:
        - active_version
        - algorithm
        - trained_at
      properties:
        active_version:
          type: string
          example: v5
        algorithm:
          type: string
          example: PLS+Ridge
        latent_components:
          type: integer
          minimum: 1
          example: 5
        trained_at:
          type: string
          format: date-time
        trained_samples:
          type: integer
          minimum: 0
          example: 187
        rmse:
          $ref: '#/components/schemas/MetricBreakdown'
        r2:
          $ref: '#/components/schemas/MetricBreakdown'
        # Exactly two numbers (minItems = maxItems = 2).
        wavenumber_range:
          type: array
          minItems: 2
          maxItems: 2
          items:
            type: number
          example:
            - 200.0
            - 3200.0
        wavenumber_count:
          type: integer
          example: 2048

    # JSON metadata that accompanies a training-set upload; only `operator`
    # is mandatory.
    TrainingMeta:
      type: object
      required:
        - operator
      properties:
        operator:
          description: Operator user id.
          type: string
          example: alice.lin
        source_batches:
          type: array
          items:
            type: string
          example:
            - PR-2026-0487
            - PR-2026-0489
        notes:
          type: string
          example: 'Additional EPA-rich samples from 4/20 batch.'

    # 202 response body of POST /training/start.
    TrainingJobAccepted:
      type: object
      required:
        - job_id
      properties:
        job_id:
          type: string
          example: train-2026-04-22-001
        estimated_duration_seconds:
          type: integer
          example: 180
        status_url:
          type: string
          example: /training/train-2026-04-22-001

    # 200 response body of GET /training/{jobId}.
    TrainingJob:
      type: object
      required: [job_id, status, progress]
      properties:
        job_id: {type: string}
        status:
          type: string
          enum: [queued, running, done, failed]
        # Fraction in the closed range 0..1.
        progress:
          type: number
          minimum: 0
          maximum: 1
          example: 0.47
        elapsed_seconds: {type: integer, minimum: 0}
        result:
          # OpenAPI 3.1 has no `nullable` keyword; 'null' must be listed in
          # `type`, otherwise payloads with `result: null` fail validation.
          type: [object, 'null']
          properties:
            new_version: {type: string, example: v6.3}
            metrics:
              type: object
              properties:
                rmse: {$ref: '#/components/schemas/MetricBreakdown'}
                r2: {$ref: '#/components/schemas/MetricBreakdown'}
            artifact_path: {type: string}
        error:
          # Same 3.1 nullability form as `result`.
          type: [object, 'null']
          properties:
            code: {type: string}
            message: {type: string}

    # 200 response body of GET /health.
    HealthStatus:
      type: object
      required: [status]
      properties:
        status:
          type: string
          enum: [ok, degraded]
          example: ok
        model_loaded: {type: boolean}
        active_version:
          # OpenAPI 3.1 has no `nullable` keyword; nullability is expressed
          # by adding 'null' to the JSON Schema type list.
          type: [string, 'null']
        uptime_seconds: {type: integer, minimum: 0}
        # Dotted version examples are quoted so they are always strings.
        server_version: {type: string, example: '0.3.1'}
        protocol_version: {type: integer, example: 1}
        python_version: {type: string, example: '3.11.4'}

    # Body shared by the 4xx responses declared in this document.
    ErrorResponse:
      type: object
      required:
        - code
        - message
      properties:
        code:
          description: |
            Machine-readable error code. Common values:
            `MODEL_NOT_FOUND`, `MODEL_CORRUPTED`, `LOAD_IN_PROGRESS`,
            `INVALID_CSV`, `TRAINING_DIVERGED`, `INTERNAL_ERROR`.
          type: string
          example: MODEL_NOT_FOUND
        message:
          description: Human-readable explanation.
          type: string
        # Optional; defaults to false when omitted.
        retryable:
          type: boolean
          default: false