{
  "apiVersion": "1",
  "version": "2026-07-09-gso-ml-domain",
  "summary": "Composite IQ uses seven scored dimensions mapped through IQ 70-160 expected-score ladders, conservative imputation rules, and chart display policies documented on the methodology page. Effective Cost uses positive input/output pricing plus a measured-or-imputed usage multiplier waterfall documented on the methodology page. Emotional Reasoning (EQ) is retained as an experimental diagnostic metric and is excluded from Composite IQ until its benchmark base becomes more rigorous.",
  "derivedRankings": [
    {
      "id": "composite-iq",
      "rankingName": "Composite IQ",
      "direction": "higher_is_better",
      "scoreField": "iq",
      "dimensions": [
        {
          "id": "D1",
          "slug": "abstract-reasoning",
          "name": "Abstract Reasoning",
          "minBenchmarks": 1,
          "benchmarks": [
            {
              "field": "arcAgi3",
              "name": "ARC-AGI-3"
            },
            {
              "field": "arcAgi2",
              "name": "ARC-AGI-2"
            },
            {
              "field": "arcAgi1",
              "name": "ARC-AGI-1"
            }
          ]
        },
        {
          "id": "D2",
          "slug": "mathematical-reasoning",
          "name": "Mathematical Reasoning",
          "minBenchmarks": 1,
          "benchmarks": [
            {
              "field": "fmT4Acc",
              "name": "FrontierMath Tier 4"
            },
            {
              "field": "fmT13Acc",
              "name": "FrontierMath Tier 1-3"
            },
            {
              "field": "proofbench",
              "name": "ProofBench"
            },
            {
              "field": "mathArena",
              "name": "MathArena"
            },
            {
              "field": "aime",
              "name": "AIME"
            }
          ]
        },
        {
          "id": "D3",
          "slug": "scientific-reasoning",
          "name": "Scientific Reasoning",
          "minBenchmarks": 1,
          "benchmarks": [
            {
              "field": "hle",
              "name": "Humanity's Last Exam"
            },
            {
              "field": "critPt",
              "name": "CritPt"
            },
            {
              "field": "sciCode",
              "name": "SciCode"
            },
            {
              "field": "gpqa",
              "name": "GPQA Diamond"
            }
          ]
        },
        {
          "id": "D4",
          "slug": "frontend-engineering",
          "name": "Frontend Engineering",
          "minBenchmarks": 2,
          "benchmarks": [
            {
              "field": "webdevCodeArena",
              "name": "Arena.ai WebDev"
            },
            {
              "field": "designArenaFrontend",
              "name": "DesignArena Frontend"
            },
            {
              "field": "designArenaFullstack",
              "name": "DesignArena Full Stack"
            },
            {
              "field": "vibeCodeBench",
              "name": "Vibe Code Bench"
            }
          ]
        },
        {
          "id": "D5",
          "slug": "backend-engineering",
          "name": "Backend Engineering",
          "minBenchmarks": 2,
          "benchmarks": [
            {
              "field": "livecodebench",
              "name": "LiveCodeBench"
            },
            {
              "field": "frontierCodeDiamond",
              "name": "FrontierCode Diamond"
            },
            {
              "field": "apexSWE",
              "name": "APEX-SWE"
            },
            {
              "field": "swebench",
              "name": "SWE-Bench Verified"
            },
            {
              "field": "swebenchPro",
              "name": "SWE-Bench Pro"
            },
            {
              "field": "deepSWEV11",
              "name": "DeepSWE v1.1"
            },
            {
              "field": "deepSWE",
              "name": "DeepSWE"
            },
            {
              "field": "sweRebench",
              "name": "SWE-rebench"
            },
            {
              "field": "sweMarathon",
              "name": "SWE Marathon"
            }
          ]
        },
        {
          "id": "D6",
          "slug": "computer-use",
          "name": "Computer Use",
          "minBenchmarks": 2,
          "benchmarks": [
            {
              "field": "terminalbench21",
              "name": "Terminal-Bench 2.1"
            },
            {
              "field": "terminalbenchHard",
              "name": "Terminal-Bench Hard"
            },
            {
              "field": "browseComp",
              "name": "BrowseComp"
            },
            {
              "field": "osworldVerified",
              "name": "OSWorld-Verified"
            },
            {
              "field": "toolathlon",
              "name": "Toolathlon"
            },
            {
              "field": "mcpAtlas",
              "name": "MCP Atlas"
            },
            {
              "field": "agentsLastExam",
              "name": "Agents' Last Exam"
            }
          ]
        },
        {
          "id": "D7",
          "slug": "reliability",
          "name": "Reliability",
          "minBenchmarks": 1,
          "benchmarks": [
            {
              "field": "ifBench",
              "name": "IFBench"
            },
            {
              "field": "aaOmniscience",
              "name": "AA-Omniscience"
            },
            {
              "field": "bullshitBench",
              "name": "BullshitBench v2"
            },
            {
              "field": "aaLCR",
              "name": "AA Long Context Reasoning"
            },
            {
              "field": "factsGrounding",
              "name": "FACTS Grounding"
            }
          ]
        }
      ]
    },
    {
      "id": "effective-cost",
      "rankingName": "Effective Cost",
      "direction": "lower_is_better",
      "scoreField": "effectiveCost",
      "unit": "USD per 1M I/O Tokens",
      "breakdown": [
        {
          "id": "published-pricing",
          "name": "Published Token Pricing",
          "inputs": [
            {
              "field": "inP",
              "name": "Input token price"
            },
            {
              "field": "outP",
              "name": "Output token price"
            },
            {
              "field": "cacheReadP",
              "name": "Cache read token price"
            },
            {
              "field": "cacheWriteP",
              "name": "Cache write token price"
            },
            {
              "field": "cacheWrite1hP",
              "name": "1-hour cache write token price"
            },
            {
              "field": "cacheStorageHourlyP",
              "name": "Cache storage hourly token price"
            }
          ],
          "summary": "Base effective cost still uses input price plus output price for 1M input tokens and 1M output tokens. Cache read/write/storage fields are exposed separately for cache-aware task-cost views and provider-specific cost decomposition."
        },
        {
          "id": "observed-token-usage",
          "name": "Observed Token Usage",
          "inputs": [
            {
              "field": "aaTokensM",
              "name": "Artificial Analysis token usage"
            },
            {
              "field": "aaInputTokensM",
              "name": "Artificial Analysis input token usage"
            },
            {
              "field": "aaOutputTokensM",
              "name": "Artificial Analysis output token usage"
            }
          ],
          "summary": "Validated token usage calibrates the median workload multiplier when available."
        },
        {
          "id": "task-cost-residuals",
          "name": "Task Cost Residuals",
          "benchmarks": [
            {
              "field": "arcCostPerTask",
              "name": "ARC-AGI cost per task"
            },
            {
              "field": "valsCostPerTest",
              "name": "VALS cost per test"
            },
            {
              "field": "swebenchCostPerTask",
              "name": "SWE-Bench cost per task"
            },
            {
              "field": "hleCostPerQuestion",
              "name": "Humanity's Last Exam cost per question"
            }
          ],
          "summary": "Task-level cost benchmarks adjust for observed cost differences not explained by published token prices alone."
        },
        {
          "id": "usage-multiplier-waterfall",
          "name": "Usage Multiplier Waterfall",
          "steps": [
            "measured benchmark multiplier from validated token usage and price-adjusted task-cost residuals",
            "one-generation-back same-family same-lineage multiplier",
            "two-generations-back same-family same-lineage multiplier",
            "geometric average of the three closest measured peers",
            "assumed 1x fallback for positive-price models"
          ],
          "summary": "Effective cost is sticker price multiplied by the first available usage multiplier in this waterfall."
        }
      ]
    }
  ],
  "updatedAt": "2026-07-13T21:57:32.936Z",
  "url": "https://www.aiiq.org/methodology/"
}