{
  "id": "ft-validate-08",
  "meta": {
    "instanceId": "vorlux-hub"
  },
  "name": "Vorlux AI | Fine-Tune Dataset Validator (Weekly)",
  "active": true,
  "nodes": [
    {
      "id": "b8c9d0e1-0008-4bbb-8008-000000000001",
      "name": "Weekly Saturday",
      "type": "n8n-nodes-base.scheduleTrigger",
      "typeVersion": 1.2,
      "position": [220, 300],
      "notes": "Fires once a week on Saturday at 00:00 in the workflow timezone. A weekly rule is used instead of a 168-hour interval so the run stays pinned to Saturday regardless of when the workflow was activated.",
      "parameters": {
        "rule": {
          "interval": [
            {
              "field": "weeks",
              "weeksInterval": 1,
              "triggerAtDay": [6],
              "triggerAtHour": 0,
              "triggerAtMinute": 0
            }
          ]
        }
      }
    },
    {
      "id": "b8c9d0e1-0008-4bbb-8008-000000000002",
      "name": "Read Training Files",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [460, 300],
      "parameters": {
        "method": "GET",
        "url": "={{$env.VORLUX_HUB_URL}}/api/admin/finetune/dataset?action=read_all",
        "options": {
          "timeout": 60000
        }
      },
      "notes": "Reads all JSONL training files from config/training/"
    },
    {
      "id": "b8c9d0e1-0008-4bbb-8008-000000000003",
      "name": "Validate Dataset",
      "type": "n8n-nodes-base.code",
      "typeVersion": 2,
      "position": [700, 300],
      "notes": "Checks JSON validity, field completeness, output length and duplicates, then emits one report item with per-file stats, totals, a 0-100 quality score and up to 20 flagged entries",
      "parameters": {
        "mode": "runOnceForAllItems",
        "jsCode": "// Validates every training example returned by the hub and builds a quality report.\n// Input: the first item carries json.data shaped as { filename: [example, ...] }; an example may be a parsed object or a raw JSONL string.\n// Output: a single item holding per-file stats, totals, qualityScore (0-100) and at most 20 flagged entries.\nconst files = $input.first()?.json?.data || {};\nconst report = { files: {}, totals: { valid: 0, invalid: 0, duplicates: 0, tooShort: 0, tooLong: 0 }, flagged: [] };\nconst seenInstructions = new Set();\nconst seenOutputs = new Set();\n// Non-string values count as blank so that .trim() can never throw on malformed examples.\nconst isBlank = (value) => typeof value !== 'string' || value.trim().length === 0;\n\nfor (const [filename, rawLines] of Object.entries(files)) {\n  // A non-array payload is treated as an empty file instead of being iterated character by character.\n  const lines = Array.isArray(rawLines) ? rawLines : [];\n  const fileReport = { total: 0, valid: 0, invalid: 0, duplicateInst: 0, duplicateOutput: 0, tooShort: 0, tooLong: 0, avgOutputLen: 0, issues: [] };\n  let totalOutputLen = 0;\n\n  for (let i = 0; i < lines.length; i++) {\n    fileReport.total++;\n    let line = lines[i];\n\n    // Raw JSONL strings are parsed here so that a malformed line is reported instead of aborting the run.\n    if (typeof line === 'string') {\n      try {\n        line = JSON.parse(line);\n      } catch (err) {\n        line = null;\n      }\n    }\n\n    // Check JSON validity\n    if (!line || typeof line !== 'object') {\n      fileReport.invalid++;\n      fileReport.issues.push({ line: i, issue: 'invalid_json' });\n      continue;\n    }\n\n    // Check required fields\n    if (isBlank(line.instruction)) {\n      fileReport.invalid++;\n      fileReport.issues.push({ line: i, issue: 'missing_instruction' });\n      continue;\n    }\n    if (isBlank(line.output)) {\n      fileReport.invalid++;\n      fileReport.issues.push({ line: i, issue: 'missing_output' });\n      continue;\n    }\n\n    // Check output length (flagged examples still count as valid)\n    const outputLen = line.output.length;\n    totalOutputLen += outputLen;\n    if (outputLen < 50) {\n      fileReport.tooShort++;\n      report.flagged.push({ file: filename, line: i, issue: 'output_too_short', len: outputLen });\n    }\n    if (outputLen > 50000) {\n      fileReport.tooLong++;\n      report.flagged.push({ file: filename, line: i, issue: 'output_too_long', len: outputLen });\n    }\n\n    // Check duplicates across all files. Keys are prefixes (200 chars of the normalized instruction, 500 chars of the output).\n    const instKey = line.instruction.toLowerCase().trim().substring(0, 200);\n    const outputKey = line.output.substring(0, 500);\n    const dupInst = seenInstructions.has(instKey);\n    const dupOutput = seenOutputs.has(outputKey);\n    seenInstructions.add(instKey);\n    seenOutputs.add(outputKey);\n    if (dupInst) fileReport.duplicateInst++;\n    if (dupOutput) fileReport.duplicateOutput++;\n    // An example is counted once in the totals even when both its instruction and its output repeat.\n    if (dupInst || dupOutput) report.totals.duplicates++;\n\n    fileReport.valid++;\n  }\n\n  // Average over valid examples only: invalid ones never contribute to totalOutputLen.\n  fileReport.avgOutputLen = fileReport.valid ? Math.round(totalOutputLen / fileReport.valid) : 0;\n  report.files[filename] = fileReport;\n  report.totals.valid += fileReport.valid;\n  report.totals.invalid += fileReport.invalid;\n  report.totals.tooShort += fileReport.tooShort;\n  report.totals.tooLong += fileReport.tooLong;\n}\n\nreport.totalExamples = report.totals.valid + report.totals.invalid;\nreport.qualityScore = report.totalExamples ? Math.round((report.totals.valid / report.totalExamples) * 100) : 0;\n// flaggedCount keeps the full number; the list itself is trimmed to keep downstream payloads small.\nreport.flaggedCount = report.flagged.length;\nreport.flagged = report.flagged.slice(0, 20);\n\nreturn [{ json: report }];"
      }
    },
    {
      "id": "b8c9d0e1-0008-4bbb-8008-000000000004",
      "name": "Store Quality Report",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [960, 200],
      "parameters": {
        "method": "POST",
        "url": "={{$env.VORLUX_HUB_URL}}/api/admin/finetune/quality-report",
        "sendBody": true,
        "specifyBody": "json",
        "jsonBody": "={{ JSON.stringify({ report: $json, timestamp: new Date().toISOString() }) }}",
        "options": {
          "timeout": 15000
        }
      }
    },
    {
      "id": "b8c9d0e1-0008-4bbb-8008-000000000005",
      "name": "Discord Report",
      "type": "n8n-nodes-base.httpRequest",
      "typeVersion": 4.2,
      "position": [960, 400],
      "notes": "Posts the validation summary as a Discord embed. The body is serialized with JSON.stringify so file names containing quotes or backslashes cannot break the payload, and the per-file list is capped at 3500 characters to stay inside Discord's 4096-character embed description limit.",
      "parameters": {
        "method": "POST",
        "url": "={{$env.DISCORD_OPS_WEBHOOK}}",
        "sendBody": true,
        "specifyBody": "json",
        "jsonBody": "={{ JSON.stringify({ embeds: [{ title: 'Dataset Validation Report', description: ['**Quality Score: ' + $json.qualityScore + '%**', '', 'Total examples: ' + $json.totalExamples, 'Valid: ' + $json.totals.valid, 'Invalid: ' + $json.totals.invalid, 'Duplicates: ' + $json.totals.duplicates, 'Too short: ' + $json.totals.tooShort, 'Too long: ' + $json.totals.tooLong, '', '**Files:**', Object.entries($json.files || {}).map(([f, r]) => f + ': ' + r.valid + '/' + r.total + ' valid (avg ' + r.avgOutputLen + ' chars)').join('\\n').slice(0, 3500), '', '**Flagged entries:** ' + $json.flaggedCount].join('\\n'), color: $json.qualityScore > 90 ? 5763719 : ($json.qualityScore > 70 ? 16776960 : 15548997), footer: { text: 'Weekly Dataset Validator' } }] }) }}",
        "options": {
          "timeout": 10000
        }
      }
    }
  ],
  "connections": {
    "Weekly Saturday": {
      "main": [
        [
          {
            "node": "Read Training Files",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Read Training Files": {
      "main": [
        [
          {
            "node": "Validate Dataset",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Validate Dataset": {
      "main": [
        [
          {
            "node": "Store Quality Report",
            "type": "main",
            "index": 0
          },
          {
            "node": "Discord Report",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  },
  "settings": {
    "executionOrder": "v1",
    "saveManualExecutions": true,
    "saveExecutionProgress": true
  },
  "tags": [
    { "name": "ai" },
    { "name": "finetune" },
    { "name": "validation" }
  ],
  "versionId": "2"
}