mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
2740 lines
82 KiB
HTML
2740 lines
82 KiB
HTML
<!DOCTYPE html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title>SkillOpt | Executive Strategy for Self-Evolving Agent Skills</title>
|
|
<meta name="description" content="Project webpage for SkillOpt, a text-space optimizer that trains reusable natural-language skills for frozen language agents.">
|
|
<link rel="icon" type="image/svg+xml" href="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 23 23'%3E%3Crect width='10' height='10' fill='%23F25022'/%3E%3Crect x='13' width='10' height='10' fill='%237FBA00'/%3E%3Crect y='13' width='10' height='10' fill='%2300A4EF'/%3E%3Crect x='13' y='13' width='10' height='10' fill='%23FFB900'/%3E%3C/svg%3E">
|
|
<style>
|
|
@import url('https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght,SOFT@9..144,300..900,0..100&family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600;700&display=swap');
|
|
|
|
/* Design tokens shared by the rules in this stylesheet. Token names do not
   always describe their value literally: --serif holds a sans-serif stack,
   --red holds a pink (#ec4899), --teal a sky blue (#0ea5e9) and --blue an
   indigo (#4f46e5). */
:root {
  /* Font stacks */
  --display: "Fraunces", Georgia, serif;
  --serif: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
  --mono: "JetBrains Mono", "SFMono-Regular", Consolas, monospace;

  /* Page and panel surfaces */
  --paper: #f8f9fc;
  --paper-2: #eef2ff;
  --panel: #ffffff;
  --panel-warm: #fff7ed;

  /* Text colours, darkest to lightest */
  --black: #0f172a;
  --ink: #1e293b;
  --muted: #64748b;
  --quiet: #94a3b8;

  /* Border colours */
  --line: #e2e8f0;
  --line-strong: #c7d2fe;

  /* Accent colours */
  --blue: #4f46e5;
  --violet: #7c3aed;
  --red: #ec4899;
  --teal: #0ea5e9;
  --gold: #f59e0b;
  --green: #16a34a;

  /* Default drop shadow */
  --shadow: 0 12px 30px rgba(79, 70, 229, 0.10);
}
|
|
|
|
* {
|
|
box-sizing: border-box;
|
|
}
|
|
|
|
html {
|
|
scroll-behavior: smooth;
|
|
}
|
|
|
|
body {
|
|
margin: 0;
|
|
color: var(--ink);
|
|
background:
|
|
radial-gradient(circle at 18% 10%, rgba(124, 58, 237, 0.08), transparent 38%),
|
|
radial-gradient(circle at 88% 78%, rgba(245, 158, 11, 0.08), transparent 42%),
|
|
radial-gradient(circle at 58% 38%, rgba(236, 72, 153, 0.055), transparent 46%),
|
|
var(--paper);
|
|
font-family: var(--serif);
|
|
line-height: 1.7;
|
|
letter-spacing: 0;
|
|
}
|
|
|
|
a {
|
|
color: inherit;
|
|
text-decoration-thickness: 1px;
|
|
text-underline-offset: 4px;
|
|
}
|
|
|
|
img {
|
|
max-width: 100%;
|
|
display: block;
|
|
}
|
|
|
|
.reveal {
|
|
opacity: 0;
|
|
transform: translateY(40px);
|
|
transition:
|
|
opacity 0.8s cubic-bezier(0.16, 1, 0.3, 1),
|
|
transform 0.8s cubic-bezier(0.16, 1, 0.3, 1);
|
|
}
|
|
|
|
.reveal.visible {
|
|
opacity: 1;
|
|
transform: translateY(0);
|
|
}
|
|
|
|
/* Honour the "reduce motion" user preference: turn off the smooth anchor
   scrolling that the html rule enables, and show .reveal content at its final
   position immediately, with no fade or slide transition. */
@media (prefers-reduced-motion: reduce) {
  html {
    scroll-behavior: auto;
  }

  .reveal {
    opacity: 1;
    transform: none;
    transition: none;
  }
}
|
|
|
|
.topbar {
|
|
position: fixed;
|
|
z-index: 100;
|
|
top: 0;
|
|
left: 0;
|
|
right: 0;
|
|
display: flex;
|
|
align-items: center;
|
|
justify-content: space-between;
|
|
gap: 24px;
|
|
padding: 20px 48px;
|
|
color: var(--ink);
|
|
background: rgba(255, 255, 255, 0.35);
|
|
border-bottom: 1px solid rgba(226, 232, 240, 0);
|
|
backdrop-filter: blur(16px) saturate(180%);
|
|
-webkit-backdrop-filter: blur(16px) saturate(180%);
|
|
font-family: var(--mono);
|
|
font-size: 0.78rem;
|
|
transition: background 0.3s, border-color 0.3s, box-shadow 0.3s, padding 0.3s;
|
|
}
|
|
|
|
.topbar.scrolled {
|
|
padding-top: 14px;
|
|
padding-bottom: 14px;
|
|
background: rgba(255, 255, 255, 0.66);
|
|
border-bottom: 1px solid rgba(226, 232, 240, 0.55);
|
|
box-shadow: 0 2px 16px rgba(15, 23, 42, 0.05);
|
|
}
|
|
|
|
.navbar-logos {
|
|
display: flex;
|
|
align-items: center;
|
|
gap: 22px;
|
|
}
|
|
|
|
.brandmark {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
gap: 2px;
|
|
color: #0f172a;
|
|
font-family: var(--display);
|
|
font-size: 1.45rem;
|
|
font-weight: 650;
|
|
letter-spacing: -0.015em;
|
|
text-decoration: none;
|
|
white-space: nowrap;
|
|
transition: opacity 0.25s ease;
|
|
}
|
|
|
|
.brandmark:hover {
|
|
opacity: 0.85;
|
|
}
|
|
|
|
.brandmark .brand-skill {
|
|
color: #0f172a;
|
|
}
|
|
|
|
.brandmark .brand-opt {
|
|
display: inline-block;
|
|
padding-right: 0.15em;
|
|
margin-right: -0.15em;
|
|
background: linear-gradient(110deg, #0ea5e9 0%, #4f46e5 25%, #7c3aed 50%, #ec4899 78%, #f59e0b 100%);
|
|
-webkit-background-clip: text;
|
|
background-clip: text;
|
|
-webkit-text-fill-color: transparent;
|
|
font-style: italic;
|
|
}
|
|
|
|
.microsoft-mark {
|
|
width: 16px;
|
|
height: 16px;
|
|
margin-right: 8px;
|
|
flex: 0 0 auto;
|
|
}
|
|
|
|
.navbar-divider {
|
|
width: 1px;
|
|
height: 30px;
|
|
background: #cbd5e1;
|
|
}
|
|
|
|
.navbar-related {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
gap: 8px;
|
|
padding: 5px 12px 5px 10px;
|
|
color: inherit;
|
|
background: rgba(255, 255, 255, 0.70);
|
|
border: 1px solid rgba(124, 58, 237, 0.20);
|
|
border-radius: 999px;
|
|
text-decoration: none;
|
|
transition: transform 0.2s ease, box-shadow 0.2s ease, border-color 0.2s ease;
|
|
}
|
|
|
|
.navbar-related:hover {
|
|
color: inherit;
|
|
transform: translateY(-1px);
|
|
border-color: rgba(124, 58, 237, 0.42);
|
|
box-shadow: 0 4px 14px rgba(124, 58, 237, 0.14);
|
|
}
|
|
|
|
.navbar-related .nr-icon {
|
|
position: relative;
|
|
width: 16px;
|
|
height: 16px;
|
|
flex: 0 0 auto;
|
|
border-radius: 5px;
|
|
background: linear-gradient(135deg, #4f46e5, #7c3aed 56%, #ec4899);
|
|
}
|
|
|
|
.navbar-related .nr-icon::before {
|
|
content: "";
|
|
position: absolute;
|
|
top: 3px;
|
|
left: 3px;
|
|
width: 6px;
|
|
height: 6px;
|
|
border: 1.5px solid #ffffff;
|
|
border-radius: 999px;
|
|
}
|
|
|
|
.navbar-related .nr-icon::after {
|
|
content: "";
|
|
position: absolute;
|
|
right: 3px;
|
|
bottom: 4px;
|
|
width: 5px;
|
|
height: 1.5px;
|
|
background: #ffffff;
|
|
border-radius: 999px;
|
|
transform: rotate(45deg);
|
|
}
|
|
|
|
.navbar-related .nr-label {
|
|
display: block;
|
|
color: var(--violet);
|
|
font-family: var(--mono);
|
|
font-size: 0.58rem;
|
|
font-weight: 700;
|
|
letter-spacing: 0.12em;
|
|
line-height: 1;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.navbar-related .nr-name {
|
|
display: block;
|
|
margin-top: 2px;
|
|
font-family: var(--display);
|
|
font-size: 0.98rem;
|
|
font-weight: 750;
|
|
line-height: 1;
|
|
letter-spacing: -0.01em;
|
|
background: linear-gradient(135deg, #4f46e5, #ec4899);
|
|
-webkit-background-clip: text;
|
|
background-clip: text;
|
|
-webkit-text-fill-color: transparent;
|
|
}
|
|
|
|
.nav {
|
|
display: flex;
|
|
align-items: center;
|
|
justify-content: flex-end;
|
|
flex-wrap: wrap;
|
|
gap: 18px 28px;
|
|
}
|
|
|
|
.nav a {
|
|
color: var(--muted);
|
|
font-size: 0.92rem;
|
|
font-weight: 600;
|
|
text-decoration: none;
|
|
border-bottom: 1px solid transparent;
|
|
transition: color 0.2s, border-color 0.2s;
|
|
}
|
|
|
|
/* Highlight a navigation link on pointer hover and on keyboard focus, so both
   input methods get the same text colour and underline colour change. */
.nav a:hover,
.nav a:focus-visible {
  color: var(--blue);
  border-color: var(--red);
}
|
|
|
|
.hero {
|
|
position: relative;
|
|
min-height: 76vh;
|
|
display: grid;
|
|
align-items: center;
|
|
padding: 124px 28px 84px;
|
|
color: var(--ink);
|
|
overflow: hidden;
|
|
background:
|
|
radial-gradient(circle at 20% 18%, rgba(14, 165, 233, 0.13), transparent 32%),
|
|
radial-gradient(circle at 72% 24%, rgba(236, 72, 153, 0.12), transparent 34%),
|
|
linear-gradient(160deg, #eef2ff 0%, #f8f9fc 42%, #eef2ff 100%);
|
|
}
|
|
|
|
.hero-inner {
|
|
width: min(1080px, 100%);
|
|
margin: 0 auto;
|
|
display: grid;
|
|
grid-template-columns: minmax(0, 0.98fr) minmax(300px, 0.5fr);
|
|
gap: 36px;
|
|
align-items: center;
|
|
}
|
|
|
|
.kicker {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
gap: 10px;
|
|
width: fit-content;
|
|
padding: 7px 16px;
|
|
color: var(--violet);
|
|
background: rgba(124, 58, 237, 0.08);
|
|
border: 1px solid rgba(124, 58, 237, 0.20);
|
|
border-radius: 999px;
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
font-weight: 600;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.hero h1 {
|
|
margin: 22px 0 12px;
|
|
font-family: var(--display);
|
|
font-size: clamp(3.8rem, 8vw, 6.7rem);
|
|
line-height: 0.92;
|
|
letter-spacing: -0.02em;
|
|
max-width: 820px;
|
|
background: linear-gradient(110deg, #0ea5e9 0%, #4f46e5 32%, #7c3aed 54%, #ec4899 78%, #f59e0b 100%);
|
|
-webkit-background-clip: text;
|
|
background-clip: text;
|
|
-webkit-text-fill-color: transparent;
|
|
}
|
|
|
|
.hero-subtitle {
|
|
max-width: 760px;
|
|
margin: 0;
|
|
color: #334155;
|
|
font-size: clamp(1.08rem, 1.7vw, 1.34rem);
|
|
line-height: 1.55;
|
|
font-weight: 500;
|
|
}
|
|
|
|
.hero-actions {
|
|
display: flex;
|
|
flex-wrap: wrap;
|
|
gap: 12px;
|
|
margin-top: 30px;
|
|
}
|
|
|
|
.button {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
justify-content: center;
|
|
gap: 8px;
|
|
min-height: 42px;
|
|
padding: 12px 22px;
|
|
font-family: var(--mono);
|
|
font-size: 0.78rem;
|
|
font-weight: 700;
|
|
text-decoration: none;
|
|
border: 1px solid currentColor;
|
|
border-radius: 999px;
|
|
box-shadow: 0 2px 8px rgba(15, 23, 42, 0.05);
|
|
transition: transform 180ms ease, background 180ms ease, color 180ms ease, box-shadow 180ms ease;
|
|
}
|
|
|
|
/* Lift the button by 2px and deepen its shadow on pointer hover and on
   keyboard focus. */
.button:hover,
.button:focus-visible {
  transform: translateY(-2px);
  box-shadow: 0 12px 24px rgba(124, 58, 237, 0.16);
}
|
|
|
|
.button.primary {
|
|
color: #ffffff;
|
|
background: linear-gradient(135deg, #4f46e5 0%, #ec4899 100%);
|
|
border-color: transparent;
|
|
}
|
|
|
|
.button.secondary {
|
|
color: var(--ink);
|
|
background: #ffffff;
|
|
border-color: var(--line);
|
|
}
|
|
|
|
.button.tertiary {
|
|
color: #ffffff;
|
|
background: var(--black);
|
|
border-color: var(--black);
|
|
}
|
|
|
|
.button-icon {
|
|
width: 16px;
|
|
height: 16px;
|
|
flex: 0 0 auto;
|
|
fill: currentColor;
|
|
}
|
|
|
|
.arxiv-icon {
|
|
width: 15px;
|
|
height: 20px;
|
|
object-fit: contain;
|
|
}
|
|
|
|
/* Pill-shaped link: an inline flex row on a translucent white background
   that blurs whatever is rendered behind it. */
.related-project {
  display: inline-flex;
  align-items: center;
  gap: 14px;
  max-width: 100%;
  margin-top: 28px;
  padding: 10px 22px 10px 14px;
  color: inherit;
  background: rgba(255, 255, 255, 0.72);
  border: 1px solid rgba(124, 58, 237, 0.20);
  border-radius: 999px;
  box-shadow: 0 2px 8px rgba(15, 23, 42, 0.05);
  text-align: left;
  text-decoration: none;
  backdrop-filter: blur(8px);
  /* Prefixed twin of the declaration above, paired the same way .topbar does,
     for WebKit builds that only recognise the prefixed property. */
  -webkit-backdrop-filter: blur(8px);
  transition: transform 180ms ease, border-color 180ms ease, box-shadow 180ms ease;
}
|
|
|
|
.related-project:hover {
|
|
color: inherit;
|
|
transform: translateY(-2px);
|
|
border-color: rgba(124, 58, 237, 0.42);
|
|
box-shadow: 0 12px 28px rgba(124, 58, 237, 0.14);
|
|
}
|
|
|
|
.related-icon {
|
|
position: relative;
|
|
flex: 0 0 auto;
|
|
width: 28px;
|
|
height: 28px;
|
|
border-radius: 10px;
|
|
background: linear-gradient(135deg, #4f46e5, #7c3aed 56%, #ec4899);
|
|
}
|
|
|
|
.related-icon::before {
|
|
content: "";
|
|
position: absolute;
|
|
top: 6px;
|
|
left: 6px;
|
|
width: 10px;
|
|
height: 10px;
|
|
border: 2px solid #ffffff;
|
|
border-radius: 999px;
|
|
}
|
|
|
|
.related-icon::after {
|
|
content: "";
|
|
position: absolute;
|
|
right: 7px;
|
|
bottom: 7px;
|
|
width: 8px;
|
|
height: 2px;
|
|
background: #ffffff;
|
|
border-radius: 999px;
|
|
transform: rotate(45deg);
|
|
transform-origin: center;
|
|
}
|
|
|
|
.related-text {
|
|
display: flex;
|
|
min-width: 0;
|
|
flex-direction: column;
|
|
gap: 2px;
|
|
}
|
|
|
|
.related-tag {
|
|
color: var(--violet);
|
|
font-family: var(--mono);
|
|
font-size: 0.62rem;
|
|
font-weight: 700;
|
|
letter-spacing: 0.14em;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.related-title {
|
|
color: var(--ink);
|
|
font-size: 0.98rem;
|
|
font-weight: 600;
|
|
line-height: 1.3;
|
|
}
|
|
|
|
.related-title strong {
|
|
font-family: var(--display);
|
|
font-size: 1.16rem;
|
|
font-weight: 700;
|
|
font-style: italic;
|
|
background: linear-gradient(135deg, #4f46e5, #ec4899);
|
|
-webkit-background-clip: text;
|
|
background-clip: text;
|
|
-webkit-text-fill-color: transparent;
|
|
}
|
|
|
|
.related-summary {
|
|
color: var(--muted);
|
|
font-size: 0.8rem;
|
|
line-height: 1.35;
|
|
}
|
|
|
|
.related-arrow {
|
|
flex: 0 0 auto;
|
|
color: var(--red);
|
|
font-size: 1.1rem;
|
|
transition: transform 180ms ease;
|
|
}
|
|
|
|
.related-project:hover .related-arrow {
|
|
transform: translateX(4px);
|
|
}
|
|
|
|
/* Translucent, rounded card. It is positioned and clips overflow so that the
   absolutely positioned ::before grid overlay stays inside its corners. */
.hero-ledger {
  position: relative;
  overflow: hidden;
  width: 100%;
  padding: 24px;
  color: var(--ink);
  background: rgba(255, 255, 255, 0.78);
  border: 1px solid rgba(124, 58, 237, 0.20);
  border-radius: 18px;
  box-shadow: var(--shadow);
  backdrop-filter: blur(10px);
  /* Prefixed twin of the declaration above, paired the same way .topbar does,
     for WebKit builds that only recognise the prefixed property. */
  -webkit-backdrop-filter: blur(10px);
}
|
|
|
|
.hero-ledger::before {
|
|
content: "";
|
|
position: absolute;
|
|
inset: 0;
|
|
background:
|
|
linear-gradient(90deg, rgba(124, 58, 237, 0.08) 1px, transparent 1px),
|
|
linear-gradient(0deg, rgba(124, 58, 237, 0.06) 1px, transparent 1px);
|
|
background-size: 26px 26px;
|
|
opacity: 0.34;
|
|
pointer-events: none;
|
|
}
|
|
|
|
.hero-ledger > * {
|
|
position: relative;
|
|
}
|
|
|
|
.ledger-kicker {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
min-height: 28px;
|
|
padding: 5px 11px;
|
|
color: var(--violet);
|
|
background: rgba(124, 58, 237, 0.10);
|
|
border: 1px solid rgba(124, 58, 237, 0.20);
|
|
border-radius: 999px;
|
|
font-family: var(--mono);
|
|
font-size: 0.71rem;
|
|
font-weight: 800;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.ledger-hero {
|
|
display: flex;
|
|
align-items: baseline;
|
|
gap: 8px;
|
|
margin-top: 18px;
|
|
}
|
|
|
|
.ledger-value {
|
|
font-family: var(--display);
|
|
font-size: 5.85rem;
|
|
font-weight: 800;
|
|
line-height: 0.9;
|
|
white-space: nowrap;
|
|
color: var(--black);
|
|
}
|
|
|
|
.ledger-denominator {
|
|
color: var(--quiet);
|
|
font-family: var(--display);
|
|
font-size: 2.65rem;
|
|
font-weight: 800;
|
|
line-height: 1;
|
|
}
|
|
|
|
.ledger-copy {
|
|
max-width: 320px;
|
|
margin: 12px 0 0;
|
|
color: var(--muted);
|
|
font-size: 1.02rem;
|
|
line-height: 1.45;
|
|
}
|
|
|
|
.ledger-stats {
|
|
display: grid;
|
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
|
gap: 0;
|
|
margin-top: 22px;
|
|
border-top: 1px solid var(--line);
|
|
}
|
|
|
|
.ledger-stat {
|
|
padding-top: 14px;
|
|
}
|
|
|
|
.ledger-stat + .ledger-stat {
|
|
padding-left: 16px;
|
|
border-left: 1px solid var(--line);
|
|
}
|
|
|
|
.ledger-stat span {
|
|
display: block;
|
|
color: var(--quiet);
|
|
font-family: var(--mono);
|
|
font-size: 0.68rem;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.ledger-stat b {
|
|
display: block;
|
|
margin-top: 6px;
|
|
color: var(--violet);
|
|
font-family: var(--display);
|
|
font-size: 2.25rem;
|
|
line-height: 1;
|
|
}
|
|
|
|
.ledger-stat b.ledger-stat-text {
|
|
max-width: 8.5rem;
|
|
font-size: 1.08rem;
|
|
line-height: 1.12;
|
|
}
|
|
|
|
main {
|
|
width: min(1080px, calc(100% - 40px));
|
|
margin: 0 auto;
|
|
}
|
|
|
|
.section {
|
|
position: relative;
|
|
padding: 72px 0 4px;
|
|
}
|
|
|
|
.section-header {
|
|
position: relative;
|
|
display: grid;
|
|
grid-template-columns: minmax(200px, 0.42fr) minmax(0, 1fr);
|
|
gap: 48px;
|
|
align-items: start;
|
|
margin-bottom: 26px;
|
|
border-top: 0;
|
|
padding-top: 4px;
|
|
}
|
|
|
|
.section-header::before {
|
|
content: "";
|
|
position: absolute;
|
|
top: -18px;
|
|
left: 0;
|
|
width: 56px;
|
|
height: 4px;
|
|
border-radius: 999px;
|
|
background: linear-gradient(90deg, var(--blue), var(--red));
|
|
box-shadow: 0 6px 18px rgba(236, 72, 153, 0.24);
|
|
}
|
|
|
|
.section-eyebrow {
|
|
font-family: var(--mono);
|
|
color: var(--violet);
|
|
font-size: 0.76rem;
|
|
font-weight: 700;
|
|
letter-spacing: 0.12em;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
h2 {
|
|
margin: 0;
|
|
font-family: var(--display);
|
|
font-size: 2.55rem;
|
|
line-height: 1.04;
|
|
letter-spacing: -0.015em;
|
|
color: #0f172a;
|
|
}
|
|
|
|
.section-lede {
|
|
margin: 10px 0 0;
|
|
color: var(--muted);
|
|
font-size: 1.05rem;
|
|
max-width: 740px;
|
|
}
|
|
|
|
.manifesto {
|
|
display: grid;
|
|
grid-template-columns: 1.05fr 0.95fr;
|
|
gap: 18px;
|
|
align-items: stretch;
|
|
}
|
|
|
|
.statement {
|
|
padding: 30px;
|
|
background:
|
|
linear-gradient(135deg, rgba(79, 70, 229, 0.94), rgba(236, 72, 153, 0.90)),
|
|
var(--blue);
|
|
color: #ffffff;
|
|
border-radius: 16px;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
.statement h3,
|
|
.panel h3 {
|
|
margin: 0 0 12px;
|
|
font-family: var(--display);
|
|
font-size: 1.45rem;
|
|
line-height: 1.12;
|
|
letter-spacing: 0;
|
|
}
|
|
|
|
.statement p {
|
|
margin: 0;
|
|
color: rgba(255, 255, 255, 0.84);
|
|
font-size: 1.04rem;
|
|
}
|
|
|
|
.chip-row {
|
|
display: flex;
|
|
flex-wrap: wrap;
|
|
gap: 9px;
|
|
margin-top: 24px;
|
|
}
|
|
|
|
.chip {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
min-height: 30px;
|
|
padding: 6px 11px;
|
|
color: #ffffff;
|
|
background: rgba(255, 255, 255, 0.16);
|
|
border: 1px solid rgba(255, 255, 255, 0.28);
|
|
border-radius: 999px;
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
font-weight: 600;
|
|
}
|
|
|
|
.steps {
|
|
display: grid;
|
|
grid-template-columns: repeat(2, minmax(0, 1fr));
|
|
gap: 10px;
|
|
}
|
|
|
|
.step {
|
|
min-height: 128px;
|
|
padding: 18px;
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
box-shadow: 0 1px 4px rgba(15, 23, 42, 0.04);
|
|
transition: transform 180ms ease, border-color 180ms ease, box-shadow 180ms ease;
|
|
}
|
|
|
|
.step:hover,
|
|
.panel:hover,
|
|
.transfer:hover {
|
|
transform: translateY(-3px);
|
|
border-color: var(--line-strong);
|
|
box-shadow: 0 10px 28px rgba(124, 58, 237, 0.10);
|
|
}
|
|
|
|
.step strong {
|
|
display: block;
|
|
margin-bottom: 8px;
|
|
font-family: var(--mono);
|
|
color: var(--violet);
|
|
font-size: 0.78rem;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.step p {
|
|
margin: 0;
|
|
color: var(--muted);
|
|
font-size: 0.96rem;
|
|
}
|
|
|
|
.figure-frame {
|
|
margin-top: 22px;
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 16px;
|
|
overflow: hidden;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
.figure-frame img {
|
|
width: 100%;
|
|
background: #ffffff;
|
|
}
|
|
|
|
.comparison-frame {
|
|
margin-top: 18px;
|
|
padding: 18px;
|
|
color: var(--ink);
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 16px;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
.comparison-head {
|
|
display: flex;
|
|
justify-content: space-between;
|
|
gap: 18px;
|
|
align-items: end;
|
|
margin-bottom: 16px;
|
|
}
|
|
|
|
.comparison-heading span {
|
|
color: var(--red);
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
font-weight: 700;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.comparison-heading h3 {
|
|
margin: 6px 0 0;
|
|
font-family: var(--display);
|
|
font-size: 2rem;
|
|
line-height: 1;
|
|
letter-spacing: 0;
|
|
}
|
|
|
|
.comparison-legend {
|
|
display: flex;
|
|
flex-wrap: wrap;
|
|
justify-content: flex-end;
|
|
gap: 8px;
|
|
max-width: 560px;
|
|
}
|
|
|
|
.legend-chip {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
gap: 7px;
|
|
min-height: 26px;
|
|
padding: 5px 8px;
|
|
color: var(--muted);
|
|
background: #f8fafc;
|
|
border: 1px solid var(--line);
|
|
border-radius: 999px;
|
|
font-family: var(--mono);
|
|
font-size: 0.67rem;
|
|
}
|
|
|
|
.legend-chip::before {
|
|
content: "";
|
|
width: 10px;
|
|
height: 10px;
|
|
background: var(--color);
|
|
border-radius: 3px;
|
|
}
|
|
|
|
.comparison-grid {
|
|
display: grid;
|
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
|
gap: 12px;
|
|
}
|
|
|
|
.benchmark-panel {
|
|
min-width: 0;
|
|
padding: 14px;
|
|
background:
|
|
linear-gradient(180deg, rgba(255, 255, 255, 0.92), rgba(248, 250, 252, 0.70)),
|
|
#ffffff;
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
}
|
|
|
|
.benchmark-top {
|
|
display: flex;
|
|
justify-content: space-between;
|
|
gap: 10px;
|
|
align-items: start;
|
|
margin-bottom: 10px;
|
|
}
|
|
|
|
.benchmark-top h4 {
|
|
margin: 0;
|
|
font-family: var(--display);
|
|
font-size: 1.28rem;
|
|
line-height: 1;
|
|
letter-spacing: 0;
|
|
}
|
|
|
|
.delta-pill {
|
|
flex: none;
|
|
padding: 5px 8px;
|
|
color: var(--green);
|
|
background: rgba(22, 163, 74, 0.10);
|
|
border: 1px solid rgba(22, 163, 74, 0.25);
|
|
border-radius: 999px;
|
|
font-family: var(--mono);
|
|
font-size: 0.67rem;
|
|
font-weight: 700;
|
|
white-space: nowrap;
|
|
}
|
|
|
|
.bar-stage {
|
|
position: relative;
|
|
display: flex;
|
|
align-items: flex-end;
|
|
gap: 6px;
|
|
height: 170px;
|
|
padding: 24px 8px 22px 34px;
|
|
background:
|
|
linear-gradient(rgba(124, 58, 237, 0.08) 1px, transparent 1px) 0 24px / 100% 42px,
|
|
rgba(238, 242, 255, 0.58);
|
|
border-left: 1px solid rgba(124, 58, 237, 0.20);
|
|
border-bottom: 1px solid rgba(124, 58, 237, 0.20);
|
|
border-radius: 12px;
|
|
}
|
|
|
|
.axis-range {
|
|
position: absolute;
|
|
left: 7px;
|
|
bottom: 6px;
|
|
color: var(--quiet);
|
|
font-family: var(--mono);
|
|
font-size: 0.58rem;
|
|
writing-mode: vertical-rl;
|
|
transform: rotate(180deg);
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.method-bar {
|
|
position: relative;
|
|
flex: 1;
|
|
min-width: 0;
|
|
height: max(8px, var(--h));
|
|
background: var(--color);
|
|
border-radius: 4px 4px 2px 2px;
|
|
opacity: 0.86;
|
|
}
|
|
|
|
.method-bar.skillopt {
|
|
border: 2px solid rgba(15, 23, 42, 0.62);
|
|
box-shadow: 0 0 0 3px rgba(22, 163, 74, 0.14), 0 10px 18px rgba(22, 163, 74, 0.20);
|
|
opacity: 1;
|
|
}
|
|
|
|
.method-bar span {
|
|
position: absolute;
|
|
left: 50%;
|
|
bottom: calc(100% + 6px);
|
|
transform: translateX(-50%);
|
|
padding: 2px 5px;
|
|
color: #f8faf7;
|
|
background: var(--green);
|
|
border-radius: 999px;
|
|
font-family: var(--mono);
|
|
font-size: 0.62rem;
|
|
font-weight: 800;
|
|
white-space: nowrap;
|
|
}
|
|
|
|
.caption {
|
|
padding: 13px 16px;
|
|
color: var(--muted);
|
|
border-top: 1px solid var(--line);
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
line-height: 1.55;
|
|
}
|
|
|
|
.teaser-showcase {
|
|
position: relative;
|
|
margin-top: -28px;
|
|
padding: 22px;
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 16px;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
.video-showcase {
|
|
margin-top: -28px;
|
|
margin-bottom: 22px;
|
|
}
|
|
|
|
.video-frame {
|
|
margin: 18px 0 0;
|
|
padding: 14px;
|
|
background: #ffffff;
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
box-shadow: 0 1px 4px rgba(15, 23, 42, 0.04);
|
|
}
|
|
|
|
.video-frame iframe {
|
|
width: 100%;
|
|
aspect-ratio: 16 / 9;
|
|
display: block;
|
|
background: #0d1117;
|
|
border: 0;
|
|
border-radius: 10px;
|
|
}
|
|
|
|
.teaser-heading {
|
|
display: grid;
|
|
grid-template-columns: 160px 1fr;
|
|
gap: 20px;
|
|
align-items: start;
|
|
padding-bottom: 16px;
|
|
border-bottom: 1px solid var(--line);
|
|
}
|
|
|
|
.teaser-heading span {
|
|
color: var(--red);
|
|
font-family: var(--mono);
|
|
font-size: 0.75rem;
|
|
font-weight: 600;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.teaser-heading h2 {
|
|
font-size: 2.25rem;
|
|
}
|
|
|
|
.teaser-figure {
|
|
margin: 18px 0 0;
|
|
padding: 14px;
|
|
background: #ffffff;
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
overflow-x: auto;
|
|
}
|
|
|
|
.teaser-figure img {
|
|
width: 100%;
|
|
min-width: 760px;
|
|
height: auto;
|
|
}
|
|
|
|
.teaser-caption {
|
|
margin: 12px 0 0;
|
|
color: var(--muted);
|
|
font-family: var(--mono);
|
|
font-size: 0.73rem;
|
|
line-height: 1.55;
|
|
}
|
|
|
|
.table-wrap {
|
|
overflow-x: auto;
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 16px;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
table {
|
|
width: 100%;
|
|
border-collapse: collapse;
|
|
min-width: 1040px;
|
|
font-family: var(--mono);
|
|
font-size: 0.78rem;
|
|
line-height: 1.35;
|
|
}
|
|
|
|
th {
|
|
position: sticky;
|
|
top: 0;
|
|
z-index: 1;
|
|
padding: 12px 14px;
|
|
text-align: left;
|
|
color: #ffffff;
|
|
background: linear-gradient(135deg, #4f46e5, #7c3aed);
|
|
border-bottom: 1px solid var(--line-strong);
|
|
font-weight: 600;
|
|
}
|
|
|
|
td {
|
|
padding: 12px 14px;
|
|
border-bottom: 1px solid var(--line);
|
|
vertical-align: middle;
|
|
}
|
|
|
|
tr:last-child td {
|
|
border-bottom: 0;
|
|
}
|
|
|
|
tbody tr:nth-child(even) td {
|
|
background: #f8fafc;
|
|
}
|
|
|
|
.harness-group td {
|
|
border-top: 2px solid var(--line-strong);
|
|
}
|
|
|
|
.num {
|
|
text-align: right;
|
|
white-space: nowrap;
|
|
}
|
|
|
|
.model-cell {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
gap: 8px;
|
|
white-space: nowrap;
|
|
font-weight: 700;
|
|
}
|
|
|
|
.model-icon {
|
|
width: 20px;
|
|
height: 20px;
|
|
flex: 0 0 auto;
|
|
object-fit: contain;
|
|
border-radius: 5px;
|
|
background: #ffffff;
|
|
box-shadow: 0 0 0 1px rgba(226, 232, 240, 0.9);
|
|
}
|
|
|
|
.heat {
|
|
color: var(--ink);
|
|
background:
|
|
linear-gradient(90deg, rgba(22, 163, 74, 0.16) 0%, rgba(22, 163, 74, 0.16) calc(var(--heat) * 1%), transparent calc(var(--heat) * 1%)) !important;
|
|
font-weight: 600;
|
|
}
|
|
|
|
.heat-avg {
|
|
color: #4338ca;
|
|
background:
|
|
linear-gradient(135deg, rgba(79, 70, 229, 0.12), rgba(236, 72, 153, 0.10)),
|
|
#f8fafc !important;
|
|
font-weight: 700;
|
|
}
|
|
|
|
.method-grid {
|
|
display: grid;
|
|
grid-template-columns: repeat(4, minmax(0, 1fr));
|
|
gap: 12px;
|
|
}
|
|
|
|
.panel {
|
|
padding: 22px;
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
transition: transform 180ms ease, border-color 180ms ease, box-shadow 180ms ease;
|
|
}
|
|
|
|
.panel.accent-blue {
|
|
border-top: 4px solid var(--blue);
|
|
}
|
|
|
|
.panel.accent-red {
|
|
border-top: 4px solid var(--red);
|
|
}
|
|
|
|
.panel.accent-gold {
|
|
border-top: 4px solid var(--gold);
|
|
}
|
|
|
|
.panel.accent-green {
|
|
border-top: 4px solid var(--green);
|
|
}
|
|
|
|
.panel p {
|
|
margin: 0;
|
|
color: var(--muted);
|
|
font-size: 0.96rem;
|
|
}
|
|
|
|
.callout {
|
|
margin-top: 18px;
|
|
padding: 18px 20px;
|
|
color: var(--ink);
|
|
background: var(--panel-warm);
|
|
border: 1px solid rgba(245, 158, 11, 0.24);
|
|
border-left: 6px solid var(--gold);
|
|
border-radius: 14px;
|
|
font-size: 1rem;
|
|
}
|
|
|
|
.ablation-layout {
|
|
display: grid;
|
|
gap: 16px;
|
|
}
|
|
|
|
.ablation-layout table {
|
|
min-width: 720px;
|
|
}
|
|
|
|
.ablation-summary .mini-list {
|
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
|
}
|
|
|
|
.evolution-shell {
|
|
display: grid;
|
|
grid-template-columns: minmax(0, 1.42fr) minmax(300px, 0.58fr);
|
|
gap: 16px;
|
|
align-items: stretch;
|
|
min-height: 520px;
|
|
}
|
|
|
|
.evolution-chart {
|
|
display: flex;
|
|
flex-direction: column;
|
|
min-height: 520px;
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 16px;
|
|
box-shadow: var(--shadow);
|
|
overflow: hidden;
|
|
}
|
|
|
|
.chart-toolbar {
|
|
display: flex;
|
|
justify-content: space-between;
|
|
gap: 14px;
|
|
padding: 16px 18px 12px;
|
|
border-bottom: 1px solid var(--line);
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
color: var(--muted);
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.chart-legend {
|
|
display: flex;
|
|
flex-wrap: wrap;
|
|
gap: 9px 14px;
|
|
}
|
|
|
|
.legend-item {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
gap: 7px;
|
|
white-space: nowrap;
|
|
}
|
|
|
|
.legend-item::before {
|
|
content: "";
|
|
width: 22px;
|
|
height: 3px;
|
|
background: var(--legend);
|
|
border-radius: 999px;
|
|
}
|
|
|
|
.chart-scroller {
|
|
flex: 1;
|
|
overflow-x: auto;
|
|
padding: 10px 14px 0;
|
|
}
|
|
|
|
.skill-chart {
|
|
width: 100%;
|
|
min-width: 760px;
|
|
height: 100%;
|
|
min-height: 390px;
|
|
display: block;
|
|
font-family: var(--mono);
|
|
}
|
|
|
|
.chart-grid {
|
|
stroke: rgba(124, 58, 237, 0.12);
|
|
stroke-width: 1;
|
|
}
|
|
|
|
.chart-axis {
|
|
stroke: rgba(79, 70, 229, 0.32);
|
|
stroke-width: 1.2;
|
|
}
|
|
|
|
.chart-label {
|
|
fill: var(--quiet);
|
|
font-size: 11px;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.line-train,
|
|
.line-selection {
|
|
fill: none;
|
|
stroke-linecap: round;
|
|
stroke-linejoin: round;
|
|
stroke-width: 4;
|
|
vector-effect: non-scaling-stroke;
|
|
}
|
|
|
|
.line-train {
|
|
stroke: var(--teal);
|
|
}
|
|
|
|
.line-selection {
|
|
stroke: var(--blue);
|
|
}
|
|
|
|
.chart-point {
|
|
cursor: pointer;
|
|
outline: none;
|
|
}
|
|
|
|
.chart-point circle:not(.hit) {
|
|
fill: var(--panel);
|
|
stroke-width: 3;
|
|
transition: r 140ms ease, fill 140ms ease, stroke-width 140ms ease;
|
|
vector-effect: non-scaling-stroke;
|
|
}
|
|
|
|
.chart-point .hit {
|
|
fill: transparent;
|
|
stroke: transparent;
|
|
stroke-width: 26;
|
|
}
|
|
|
|
.chart-point[data-state="accepted"] circle:not(.hit) {
|
|
stroke: var(--green);
|
|
}
|
|
|
|
.chart-point[data-state="rejected"] circle:not(.hit) {
|
|
stroke: var(--red);
|
|
}
|
|
|
|
.chart-point[data-state="slow"] circle:not(.hit) {
|
|
stroke: var(--gold);
|
|
}
|
|
|
|
.chart-point[data-state="baseline"] circle:not(.hit) {
|
|
stroke: var(--line-strong);
|
|
}
|
|
|
|
.chart-point.is-active circle:not(.hit),
|
|
.chart-point:hover circle:not(.hit),
|
|
.chart-point:focus circle:not(.hit) {
|
|
r: 7;
|
|
fill: #fff7ed;
|
|
stroke-width: 4;
|
|
}
|
|
|
|
.chart-caption {
|
|
display: flex;
|
|
justify-content: space-between;
|
|
gap: 14px;
|
|
padding: 12px 18px 16px;
|
|
color: var(--muted);
|
|
border-top: 1px solid var(--line);
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
line-height: 1.55;
|
|
}
|
|
|
|
.evolution-detail {
|
|
display: flex;
|
|
flex-direction: column;
|
|
min-height: 438px;
|
|
height: 100%;
|
|
padding: 20px;
|
|
color: var(--ink);
|
|
background: #ffffff;
|
|
border: 1px solid var(--line);
|
|
border-radius: 16px;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
.detail-kicker {
|
|
display: flex;
|
|
align-items: center;
|
|
justify-content: space-between;
|
|
gap: 10px;
|
|
margin-bottom: 14px;
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
color: var(--quiet);
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.detail-badge {
|
|
display: inline-flex;
|
|
align-items: center;
|
|
min-height: 26px;
|
|
padding: 5px 8px;
|
|
color: var(--violet);
|
|
background: rgba(124, 58, 237, 0.10);
|
|
border: 1px solid rgba(124, 58, 237, 0.18);
|
|
border-radius: 999px;
|
|
font-weight: 600;
|
|
white-space: nowrap;
|
|
}
|
|
|
|
.evolution-detail h3 {
|
|
margin: 0 0 14px;
|
|
font-family: var(--display);
|
|
font-size: 1.9rem;
|
|
line-height: 1;
|
|
letter-spacing: 0;
|
|
}
|
|
|
|
.detail-metrics {
|
|
display: grid;
|
|
grid-template-columns: repeat(2, minmax(0, 1fr));
|
|
gap: 10px;
|
|
margin: 0 0 16px;
|
|
}
|
|
|
|
.detail-metric {
|
|
padding: 12px;
|
|
background: #f8fafc;
|
|
border: 1px solid var(--line);
|
|
border-radius: 12px;
|
|
}
|
|
|
|
.detail-metric span {
|
|
display: block;
|
|
color: var(--quiet);
|
|
font-family: var(--mono);
|
|
font-size: 0.67rem;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.detail-metric b {
|
|
display: block;
|
|
margin-top: 4px;
|
|
font-family: var(--display);
|
|
font-size: 1.62rem;
|
|
line-height: 1;
|
|
}
|
|
|
|
.detail-summary {
|
|
margin: 0 0 14px;
|
|
color: var(--muted);
|
|
font-size: 0.96rem;
|
|
}
|
|
|
|
/* Marker-less list laid out as a single-column grid with 9px gaps. Its height
   is kept between 150px and 184px and it scrolls vertically on overflow. All
   padding is zero except 4px on the right. */
.detail-edits {
  list-style: none;
  margin: 0;
  padding: 0 4px 0 0;
  display: grid;
  gap: 9px;
  min-height: 150px;
  max-height: 184px;
  overflow-y: auto;
}
|
|
|
|
.detail-edits li {
|
|
padding: 10px 11px;
|
|
color: var(--muted);
|
|
background: rgba(238, 242, 255, 0.58);
|
|
border-left: 4px solid var(--violet);
|
|
border-radius: 10px;
|
|
font-size: 0.92rem;
|
|
line-height: 1.42;
|
|
}
|
|
|
|
.evolution-footnotes {
|
|
display: grid;
|
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
|
gap: 12px;
|
|
margin-top: 16px;
|
|
}
|
|
|
|
.evolution-note {
|
|
padding: 14px;
|
|
background: var(--panel);
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
font-family: var(--mono);
|
|
font-size: 0.72rem;
|
|
color: var(--muted);
|
|
line-height: 1.5;
|
|
}
|
|
|
|
.evolution-note b {
|
|
display: block;
|
|
margin-bottom: 5px;
|
|
color: var(--ink);
|
|
font-size: 0.82rem;
|
|
}
|
|
|
|
.mini-list {
|
|
display: grid;
|
|
gap: 10px;
|
|
margin-top: 16px;
|
|
}
|
|
|
|
.mini-item {
|
|
display: grid;
|
|
grid-template-columns: 96px 1fr;
|
|
gap: 14px;
|
|
padding: 13px;
|
|
background: rgba(255, 255, 255, 0.7);
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
}
|
|
|
|
.mini-item b {
|
|
color: var(--red);
|
|
font-family: var(--mono);
|
|
font-size: 0.76rem;
|
|
text-transform: uppercase;
|
|
}
|
|
|
|
.mini-item span {
|
|
color: var(--muted);
|
|
}
|
|
|
|
.transfer-grid {
|
|
display: grid;
|
|
grid-template-columns: repeat(4, minmax(0, 1fr));
|
|
gap: 12px;
|
|
}
|
|
|
|
.transfer {
|
|
padding: 18px;
|
|
color: var(--ink);
|
|
background: #ffffff;
|
|
border: 1px solid var(--line);
|
|
border-radius: 14px;
|
|
min-height: 160px;
|
|
box-shadow: 0 1px 4px rgba(15, 23, 42, 0.04);
|
|
transition: transform 180ms ease, border-color 180ms ease, box-shadow 180ms ease;
|
|
}
|
|
|
|
/* The second, third and fourth .transfer cards all get the same white
   background, declared once for the three positions. */
.transfer:nth-child(2),
.transfer:nth-child(3),
.transfer:nth-child(4) {
  background: #ffffff;
}
|
|
|
|
.transfer .big {
|
|
display: block;
|
|
margin: 8px 0;
|
|
font-family: var(--display);
|
|
font-size: 2.15rem;
|
|
font-weight: 800;
|
|
line-height: 1;
|
|
background: linear-gradient(135deg, #4f46e5, #ec4899);
|
|
-webkit-background-clip: text;
|
|
background-clip: text;
|
|
-webkit-text-fill-color: transparent;
|
|
}
|
|
|
|
.transfer p {
|
|
margin: 0;
|
|
color: var(--muted);
|
|
font-size: 0.92rem;
|
|
}
|
|
|
|
.bibtex-box {
|
|
position: relative;
|
|
overflow-x: auto;
|
|
margin-top: 18px;
|
|
padding: 22px 24px;
|
|
color: #94a3b8;
|
|
background: #1e293b;
|
|
border: 1px solid #334155;
|
|
border-radius: 12px;
|
|
box-shadow: 0 18px 44px rgba(15, 23, 42, 0.16);
|
|
}
|
|
|
|
.bibtex-box pre {
|
|
margin: 0;
|
|
}
|
|
|
|
.bibtex-box code {
|
|
font-family: var(--mono);
|
|
font-size: 0.82rem;
|
|
line-height: 1.6;
|
|
white-space: pre;
|
|
}
|
|
|
|
.copy-btn {
|
|
position: absolute;
|
|
top: 12px;
|
|
right: 12px;
|
|
padding: 6px 14px;
|
|
color: #a5b4fc;
|
|
background: rgba(124, 58, 237, 0.20);
|
|
border: 1px solid rgba(124, 58, 237, 0.30);
|
|
border-radius: 6px;
|
|
font-family: var(--mono);
|
|
font-size: 0.78rem;
|
|
font-weight: 600;
|
|
cursor: pointer;
|
|
transition: background 0.2s ease, border-color 0.2s ease, color 0.2s ease;
|
|
}
|
|
|
|
.copy-btn:hover {
|
|
background: rgba(124, 58, 237, 0.35);
|
|
}
|
|
|
|
.copy-btn.copied {
|
|
color: #86efac;
|
|
background: rgba(34, 197, 94, 0.20);
|
|
border-color: rgba(34, 197, 94, 0.30);
|
|
}
|
|
|
|
.footer {
|
|
margin-top: 80px;
|
|
padding: 32px 0 44px;
|
|
border-top: 1px solid var(--line);
|
|
color: var(--muted);
|
|
font-family: var(--mono);
|
|
font-size: 0.75rem;
|
|
display: flex;
|
|
justify-content: space-between;
|
|
gap: 18px;
|
|
flex-wrap: wrap;
|
|
}
|
|
|
|
.footer a {
|
|
color: inherit;
|
|
text-decoration-color: var(--line-strong);
|
|
text-underline-offset: 3px;
|
|
}
|
|
|
|
/* Viewports up to 980px wide: tighter navbar, stacked layouts, two-column card grids. */
@media (max-width: 980px) {
  /* Navbar spacing */
  .topbar {
    gap: 16px;
    padding: 12px 18px;
  }

  .navbar-logos {
    gap: 14px;
  }

  .navbar-related {
    padding-right: 10px;
  }

  .nav a {
    font-size: 0.85rem;
    color: var(--muted);
  }

  /* Hero */
  .hero {
    padding-top: 126px;
    min-height: auto;
  }

  .hero h1 {
    font-size: 4.1rem;
  }

  /* Grids listed here fall back to a single column. */
  .hero-inner,
  .manifesto,
  .teaser-heading,
  .section-header,
  .evolution-shell {
    grid-template-columns: 1fr;
  }

  /* Comparison header stacks vertically and its legend aligns to the start. */
  .comparison-head {
    flex-direction: column;
    align-items: flex-start;
  }

  .comparison-legend {
    justify-content: flex-start;
  }

  /* Card grids use two equal columns. */
  .comparison-grid,
  .method-grid,
  .transfer-grid,
  .evolution-footnotes {
    grid-template-columns: repeat(2, minmax(0, 1fr));
  }
}
|
|
|
|
/* Viewports up to 680px wide: single-column layout, static navbar, smaller type. */
@media (max-width: 680px) {
  /* Leave a 12px gutter on each side of the main column. */
  main {
    width: min(100% - 24px, 1160px);
  }

  /* Navbar: stacked, no longer positioned, on a translucent white background. */
  .topbar {
    position: static;
    flex-direction: column;
    align-items: flex-start;
    padding: 12px;
    background: rgba(255, 255, 255, 0.82);
  }

  .navbar-logos {
    flex-wrap: wrap;
    width: 100%;
  }

  .navbar-divider {
    display: none;
  }

  .nav {
    justify-content: flex-start;
  }

  /* Hero and heading sizes */
  .hero {
    padding: 40px 12px 34px;
  }

  .hero h1 {
    font-size: 3.1rem;
  }

  .hero-subtitle {
    font-size: 1.08rem;
  }

  h2 {
    font-size: 2rem;
  }

  /* Every grid listed here collapses to one column. */
  .method-grid,
  .ablation-summary .mini-list,
  .comparison-grid,
  .transfer-grid,
  .evolution-footnotes,
  .steps,
  .mini-item,
  .detail-metrics,
  .chart-caption,
  .ledger-stats {
    grid-template-columns: 1fr;
  }

  /* Result ledger */
  .ledger-value {
    font-size: 4.35rem;
  }

  .ledger-denominator {
    font-size: 2.05rem;
  }

  /* Stacked stats no longer need the left separator or its padding. */
  .ledger-stat + .ledger-stat {
    border-left: 0;
    padding-left: 0;
  }

  /* Charts */
  .bar-stage {
    height: 150px;
  }

  .chart-toolbar,
  .chart-caption {
    flex-direction: column;
  }

  /* Teaser blocks */
  .teaser-showcase {
    padding: 12px;
    margin-top: 12px;
  }

  .teaser-figure {
    padding: 8px;
  }
}
|
|
</style>
|
|
</head>
|
|
<body>
|
|
<header class="topbar" id="navbar" aria-label="Page navigation">
|
|
<div class="navbar-logos">
|
|
<a class="brandmark" href="#top" aria-label="SkillOpt home">
|
|
<svg class="microsoft-mark" viewBox="0 0 23 23" aria-hidden="true">
|
|
<rect width="10" height="10" fill="#F25022"></rect>
|
|
<rect x="13" width="10" height="10" fill="#7FBA00"></rect>
|
|
<rect y="13" width="10" height="10" fill="#00A4EF"></rect>
|
|
<rect x="13" y="13" width="10" height="10" fill="#FFB900"></rect>
|
|
</svg>
|
|
<span class="brand-skill">Skill</span><span class="brand-opt">Opt</span>
|
|
</a>
|
|
<div class="navbar-divider" aria-hidden="true"></div>
|
|
<a class="navbar-related" href="https://microsoft.github.io/SkillLens/" target="_blank" rel="noopener" title="Companion project: SkillLens">
|
|
<span class="nr-icon" aria-hidden="true"></span>
|
|
<span>
|
|
<span class="nr-label">Related</span>
|
|
<span class="nr-name">SkillLens</span>
|
|
</span>
|
|
</a>
|
|
</div>
|
|
<nav class="nav" aria-label="Sections">
|
|
<a href="#idea">Idea</a>
|
|
<a href="#method">Method</a>
|
|
<a href="#results">Results</a>
|
|
<a href="#ablations">Ablations</a>
|
|
<a href="#evolution">Evolution</a>
|
|
<a href="#transfer">Transfer</a>
|
|
<a href="#citation">Citation</a>
|
|
<a href="https://github.com/microsoft/SkillOpt" target="_blank" rel="noopener">Code</a>
|
|
</nav>
|
|
</header>
|
|
|
|
<section class="hero" id="top">
|
|
<div class="hero-inner">
|
|
<div>
|
|
<span class="kicker">Text-space optimization for frozen agents</span>
|
|
<h1>SkillOpt</h1>
|
|
<p class="hero-subtitle">
|
|
Executive Strategy for Self-Evolving Agent Skills. SkillOpt treats a compact
|
|
natural-language skill document as the trainable state of a frozen language
|
|
agent, then learns that document through rollouts, reflection, bounded edits,
|
|
and held-out validation gates.
|
|
</p>
|
|
<div class="hero-actions" aria-label="Primary links">
|
|
<a class="button primary" href="#idea">Core Idea</a>
|
|
<a class="button secondary" href="#method">Method</a>
|
|
<a class="button secondary" href="#results">View Results</a>
|
|
<a class="button tertiary" href="https://github.com/microsoft/SkillOpt" target="_blank" rel="noopener">
|
|
<svg class="button-icon" viewBox="0 0 16 16" aria-hidden="true">
|
|
<path d="M8 0C3.58 0 0 3.67 0 8.2c0 3.62 2.29 6.69 5.47 7.78.4.08.55-.18.55-.4 0-.2-.01-.86-.01-1.56-2.01.38-2.53-.5-2.69-.95-.09-.23-.48-.95-.82-1.14-.28-.15-.68-.52-.01-.53.63-.01 1.08.59 1.23.83.72 1.24 1.87.89 2.33.68.07-.53.28-.89.51-1.09-1.78-.21-3.64-.91-3.64-4.03 0-.89.31-1.62.82-2.19-.08-.21-.36-1.04.08-2.16 0 0 .67-.22 2.2.84A7.42 7.42 0 0 1 8 4.01c.68 0 1.36.09 2 .27 1.53-1.06 2.2-.84 2.2-.84.44 1.12.16 1.95.08 2.16.51.57.82 1.3.82 2.19 0 3.13-1.87 3.82-3.65 4.03.29.26.54.75.54 1.51 0 1.09-.01 1.97-.01 2.24 0 .22.15.48.55.4A8.1 8.1 0 0 0 16 8.2C16 3.67 12.42 0 8 0Z"/>
|
|
</svg>
|
|
Code Repo
|
|
</a>
|
|
<a class="button secondary" href="https://arxiv.org/abs/2605.23904" target="_blank" rel="noopener" aria-label="arXiv paper">
|
|
<img class="button-icon arxiv-icon" src="skillopt-assets/arxiv-logomark-small.svg" alt="" aria-hidden="true">
|
|
Paper
|
|
</a>
|
|
<a class="button secondary" href="https://youtu.be/JUBMDTCiM0M" target="_blank" rel="noopener">
|
|
<svg class="button-icon" viewBox="0 0 16 16" aria-hidden="true">
|
|
<path d="M14.7 4.1a1.9 1.9 0 0 0-1.34-1.34C12.18 2.44 8 2.44 8 2.44s-4.18 0-5.36.32A1.9 1.9 0 0 0 1.3 4.1 19.8 19.8 0 0 0 .99 7.75c0 1.28.11 2.55.31 3.65a1.9 1.9 0 0 0 1.34 1.34c1.18.32 5.36.32 5.36.32s4.18 0 5.36-.32a1.9 1.9 0 0 0 1.34-1.34c.2-1.1.31-2.37.31-3.65 0-1.28-.11-2.55-.31-3.65ZM6.6 10.04V5.46l3.9 2.29-3.9 2.29Z"/>
|
|
</svg>
|
|
Video
|
|
</a>
|
|
</div>
|
|
<a class="related-project" href="https://microsoft.github.io/SkillLens/" target="_blank" rel="noopener" aria-label="Open the SkillLens project page">
|
|
<span class="related-icon" aria-hidden="true"></span>
|
|
<span class="related-text">
|
|
<span class="related-tag">Related project</span>
|
|
<span class="related-title"><strong>SkillLens</strong> studies model-generated agent skills.</span>
|
|
<span class="related-summary">A companion project page from Microsoft Research.</span>
|
|
</span>
|
|
<span class="related-arrow" aria-hidden="true">-></span>
|
|
</a>
|
|
</div>
|
|
|
|
<!-- Headline result card: the 52/52 figure followed by three coverage stats. -->
<aside class="hero-ledger" aria-label="Key result summary">
  <span class="ledger-kicker">Main result</span>
  <!-- role="img" gives the aria-label a role it is allowed on, so the split
       "52" and "/52" spans are announced once as "52 out of 52 settings". -->
  <div class="ledger-hero" role="img" aria-label="52 out of 52 settings">
    <span class="ledger-value">52</span>
    <span class="ledger-denominator">/52</span>
  </div>
  <p class="ledger-copy">
    Best or tied-best in every model × benchmark and harness × benchmark setting.
  </p>
  <!-- role="group" makes the label valid; a bare <div> cannot carry an accessible name. -->
  <div class="ledger-stats" role="group" aria-label="Evaluation coverage">
    <div class="ledger-stat">
      <span>Target models</span>
      <b>7</b>
    </div>
    <div class="ledger-stat">
      <span>Benchmarks</span>
      <b>6</b>
    </div>
    <div class="ledger-stat">
      <span>Harnesses</span>
      <b class="ledger-stat-text">Codex + Claude Code</b>
    </div>
  </div>
</aside>
|
|
</div>
|
|
</section>
|
|
|
|
<main>
|
|
<section class="teaser-showcase video-showcase" aria-labelledby="video-title">
|
|
<div class="teaser-heading">
|
|
<span>Project Video</span>
|
|
<div>
|
|
<h2 id="video-title">SkillOpt in motion.</h2>
|
|
<p class="section-lede">
|
|
A short visual overview of how SkillOpt treats natural-language skills
|
|
as trainable artifacts: roll out, reflect, edit, validate, and export.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
<figure class="video-frame">
  <!-- The player is deferred until it nears the viewport. The explicit referrer
       policy keeps an origin-only Referer on the embed request even if the page
       is ever served with a stricter site-wide policy; YouTube uses that header
       to identify the embedding site. -->
  <iframe
    src="https://www.youtube.com/embed/JUBMDTCiM0M"
    title="SkillOpt project video"
    loading="lazy"
    referrerpolicy="strict-origin-when-cross-origin"
    allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
    allowfullscreen>
  </iframe>
</figure>
|
|
<p class="teaser-caption">
|
|
Promotional video for the SkillOpt project page. The static paper teaser is shown below for high-resolution inspection.
|
|
</p>
|
|
</section>
|
|
|
|
<section class="teaser-showcase" aria-labelledby="teaser-title">
|
|
<div class="teaser-heading">
|
|
<span>Paper Teaser</span>
|
|
<div>
|
|
<h2 id="teaser-title">The core loop at a glance.</h2>
|
|
<p class="section-lede">
|
|
The teaser summarizes the SkillOpt training loop: rollout evidence,
|
|
optimizer-side reflection, bounded skill edits, validation gating,
|
|
and the exported reusable skill.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
<figure class="teaser-figure">
|
|
<img src="skillopt-assets/teaser-1.png" alt="SkillOpt teaser figure showing the target model, optimizer model, bounded edits, validation gate, and exported best skill.">
|
|
</figure>
|
|
<p class="teaser-caption">
|
|
Figure from the SkillOpt paper. On small screens, the figure area scrolls horizontally to preserve the original details.
|
|
</p>
|
|
</section>
|
|
|
|
<section class="section" id="idea">
|
|
<div class="section-header">
|
|
<div class="section-eyebrow">01 / Core Idea</div>
|
|
<div>
|
|
<h2>Train the procedure, not the weights.</h2>
|
|
<p class="section-lede">
|
|
SkillOpt makes the skill document itself the optimization target. The
|
|
target model, backend, and harness stay fixed; the procedure that guides
|
|
evidence gathering, tool use, verification, and output formatting evolves.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="manifesto">
|
|
<article class="statement">
|
|
<h3>A skill is external state for an agent.</h3>
|
|
<p>
|
|
Instead of fine-tuning a model or hand-maintaining prompts, SkillOpt runs
|
|
the frozen agent on scored batches, asks a separate optimizer model to
|
|
propose structured edits, and accepts a candidate only when validation
|
|
performance improves.
|
|
</p>
|
|
<div class="chip-row">
|
|
<span class="chip">Frozen target model</span>
|
|
<span class="chip">Optimizer model</span>
|
|
<span class="chip">Add / delete / replace edits</span>
|
|
<span class="chip">Held-out gate</span>
|
|
</div>
|
|
</article>
|
|
|
|
<div class="steps" aria-label="Optimization loop summary">
|
|
<div class="step">
|
|
<strong>Rollout</strong>
|
|
<p>The target model executes tasks with the current skill and records scored trajectories.</p>
|
|
</div>
|
|
<div class="step">
|
|
<strong>Reflect</strong>
|
|
<p>The optimizer analyzes success and failure minibatches to find reusable procedures.</p>
|
|
</div>
|
|
<div class="step">
|
|
<strong>Edit</strong>
|
|
<p>Candidate add, delete, and replace operations are merged and ranked under a budget.</p>
|
|
</div>
|
|
<div class="step">
|
|
<strong>Gate</strong>
|
|
<p>The candidate skill is kept only if it improves held-out selection performance.</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</section>
|
|
|
|
<section class="section" id="method">
|
|
<div class="section-header">
|
|
<div class="section-eyebrow">02 / Method</div>
|
|
<div>
|
|
<h2>A training loop for natural-language skills.</h2>
|
|
<p class="section-lede">
|
|
The loop deliberately mirrors a learning algorithm: rollout evidence acts
|
|
like a forward pass, reflection acts like a language-level backward pass,
|
|
and the textual learning rate bounds how far the skill can move.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="method-grid">
|
|
<article class="panel accent-blue">
|
|
<h3>Evidence</h3>
|
|
<p>Rollout batches capture messages, tool calls, verifier feedback, task metadata, and final scores.</p>
|
|
</article>
|
|
<article class="panel accent-red">
|
|
<h3>Minibatches</h3>
|
|
<p>Failures and successes are reflected separately so edits correct recurring errors while preserving working behavior.</p>
|
|
</article>
|
|
<article class="panel accent-gold">
|
|
<h3>Bounded Edits</h3>
|
|
<p>An edit budget functions as a textual learning rate, preventing useful rules from being overwritten by broad rewrites.</p>
|
|
</article>
|
|
<article class="panel accent-green">
|
|
<h3>Memory</h3>
|
|
<p>Rejected edits, slow update, and optimizer-side meta skill provide longer-horizon feedback without bloating deployment.</p>
|
|
</article>
|
|
</div>
|
|
|
|
<!-- Pipeline diagram. It sits well below the hero, so it is fetched lazily and
     decoded off the main thread. -->
<figure class="figure-frame">
  <img src="skillopt-assets/pipeline-1.png" alt="SkillOpt pipeline showing rollout, reflection, bounded edits, validation gate, slow update, and meta skill." loading="lazy" decoding="async">
  <figcaption class="caption">
    SkillOpt pipeline from the paper. The frozen target model executes with the current skill; the optimizer model proposes bounded edits; held-out validation decides whether the candidate becomes the new current skill.
  </figcaption>
</figure>
|
|
</section>
|
|
|
|
<section class="section" id="results">
|
|
<div class="section-header">
|
|
<div class="section-eyebrow">03 / Main Results</div>
|
|
<div>
|
|
<h2>SkillOpt improves GPT and Qwen target models.</h2>
|
|
<p class="section-lede">
|
|
The table reports main-result gains across target models and
|
|
execution harnesses, comparing no-skill execution with the final
|
|
SkillOpt skill on held-out test splits.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="table-wrap">
|
|
<table aria-label="Main result gain heatmap by model, harness, and benchmark">
|
|
<!-- Column headers; scope="col" ties each header to the data cells beneath it. -->
<thead>
  <tr>
    <th scope="col">Target model</th>
    <th scope="col">Harness</th>
    <th class="num" scope="col">SearchQA</th>
    <th class="num" scope="col">Sheet</th>
    <th class="num" scope="col">Office</th>
    <th class="num" scope="col">DocVQA</th>
    <th class="num" scope="col">LiveMath</th>
    <th class="num" scope="col">ALFWorld</th>
    <th class="num" scope="col">Avg gain</th>
  </tr>
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/openai.png" alt="OpenAI logo">GPT-5.5</span></td>
|
|
<td>Direct chat</td>
|
|
<td class="num heat" style="--heat: 19;">+9.6</td>
|
|
<td class="num heat" style="--heat: 77;">+38.9</td>
|
|
<td class="num heat" style="--heat: 77;">+39.0</td>
|
|
<td class="num heat" style="--heat: 24;">+12.4</td>
|
|
<td class="num heat" style="--heat: 58;">+29.3</td>
|
|
<td class="num heat" style="--heat: 23;">+11.9</td>
|
|
<td class="num heat-avg">+23.5</td>
|
|
</tr>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/openai.png" alt="OpenAI logo">GPT-5.4</span></td>
|
|
<td>Direct chat</td>
|
|
<td class="num heat" style="--heat: 12;">+6.2</td>
|
|
<td class="num heat" style="--heat: 42;">+21.1</td>
|
|
<td class="num heat" style="--heat: 25;">+12.8</td>
|
|
<td class="num heat" style="--heat: 27;">+13.6</td>
|
|
<td class="num heat" style="--heat: 14;">+7.2</td>
|
|
<td class="num heat" style="--heat: 31;">+15.6</td>
|
|
<td class="num heat-avg">+12.8</td>
|
|
</tr>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/openai.png" alt="OpenAI logo">GPT-5.4-mini</span></td>
|
|
<td>Direct chat</td>
|
|
<td class="num heat" style="--heat: 8;">+4.3</td>
|
|
<td class="num heat" style="--heat: 22;">+11.4</td>
|
|
<td class="num heat" style="--heat: 53;">+26.7</td>
|
|
<td class="num heat" style="--heat: 33;">+16.5</td>
|
|
<td class="num heat" style="--heat: 9;">+4.8</td>
|
|
<td class="num heat" style="--heat: 25;">+12.7</td>
|
|
<td class="num heat-avg">+12.7</td>
|
|
</tr>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/openai.png" alt="OpenAI logo">GPT-5.4-nano</span></td>
|
|
<td>Direct chat</td>
|
|
<td class="num heat" style="--heat: 37;">+19.0</td>
|
|
<td class="num heat" style="--heat: 16;">+8.2</td>
|
|
<td class="num heat" style="--heat: 66;">+33.7</td>
|
|
<td class="num heat" style="--heat: 97;">+49.4</td>
|
|
<td class="num heat" style="--heat: 8;">+4.0</td>
|
|
<td class="num heat" style="--heat: 69;">+35.1</td>
|
|
<td class="num heat-avg">+24.9</td>
|
|
</tr>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/openai.png" alt="OpenAI logo">GPT-5.2</span></td>
|
|
<td>Direct chat</td>
|
|
<td class="num heat" style="--heat: 22;">+11.2</td>
|
|
<td class="num heat" style="--heat: 37;">+18.9</td>
|
|
<td class="num heat" style="--heat: 42;">+21.5</td>
|
|
<td class="num heat" style="--heat: 33;">+16.5</td>
|
|
<td class="num heat" style="--heat: 30;">+15.2</td>
|
|
<td class="num heat" style="--heat: 32;">+16.4</td>
|
|
<td class="num heat-avg">+16.6</td>
|
|
</tr>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/qwen-color.png" alt="Qwen logo">Qwen3.5-4B</span></td>
|
|
<td>Direct chat</td>
|
|
<td class="num heat" style="--heat: 6;">+3.1</td>
|
|
<td class="num heat" style="--heat: 29;">+14.6</td>
|
|
<td class="num heat" style="--heat: 30;">+15.2</td>
|
|
<td class="num heat" style="--heat: 4;">+2.1</td>
|
|
<td class="num heat" style="--heat: 58;">+29.6</td>
|
|
<td class="num heat" style="--heat: 100;">+50.7</td>
|
|
<td class="num heat-avg">+19.2</td>
|
|
</tr>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/qwen-color.png" alt="Qwen logo">Qwen3.6-35B-A3B</span></td>
|
|
<td>Direct chat</td>
|
|
<td class="num heat" style="--heat: 15;">+7.6</td>
|
|
<td class="num heat" style="--heat: 18;">+9.3</td>
|
|
<td class="num heat" style="--heat: 2;">+1.2</td>
|
|
<td class="num heat" style="--heat: 7;">+3.8</td>
|
|
<td class="num heat" style="--heat: 21;">+10.4</td>
|
|
<td class="num heat" style="--heat: 44;">+22.4</td>
|
|
<td class="num heat-avg">+9.1</td>
|
|
</tr>
|
|
<tr class="harness-group">
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/openai.png" alt="OpenAI logo">GPT-5.5</span></td>
|
|
<td>Codex</td>
|
|
<td class="num heat" style="--heat: 11;">+5.5</td>
|
|
<td class="num heat" style="--heat: 100;">+57.5</td>
|
|
<td class="num heat" style="--heat: 25;">+12.8</td>
|
|
<td class="num heat" style="--heat: 10;">+5.0</td>
|
|
<td class="num heat" style="--heat: 55;">+28.0</td>
|
|
<td class="num">N/A</td>
|
|
<td class="num heat-avg">+21.8</td>
|
|
</tr>
|
|
<tr>
|
|
<td><span class="model-cell"><img class="model-icon" src="skillopt-assets/openai.png" alt="OpenAI logo">GPT-5.5</span></td>
|
|
<td>Claude Code</td>
|
|
<td class="num heat" style="--heat: 8;">+4.0</td>
|
|
<td class="num heat" style="--heat: 100;">+58.3</td>
|
|
<td class="num heat" style="--heat: 27;">+13.9</td>
|
|
<td class="num heat" style="--heat: 7;">+3.5</td>
|
|
<td class="num heat" style="--heat: 26;">+13.3</td>
|
|
<td class="num">N/A</td>
|
|
<td class="num heat-avg">+18.6</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</div>
|
|
|
|
<!-- Method comparison block. The legend and grid containers are empty in the
     markup; presumably a script fills them in (confirm against the page script). -->
<section class="comparison-frame" aria-labelledby="comparison-title">
  <div class="comparison-head">
    <div class="comparison-heading">
      <span>Method comparison</span>
      <h3 id="comparison-title">SkillOpt clears the strongest baseline on every benchmark.</h3>
    </div>
    <!-- role="group" gives each aria-label a role that is allowed to carry a name. -->
    <div class="comparison-legend" id="method-comparison-legend" role="group" aria-label="Compared methods"></div>
  </div>
  <div class="comparison-grid" id="method-comparison-grid" role="group" aria-label="Per-benchmark method comparison"></div>
</section>
|
|
|
|
</section>
|
|
|
|
<section class="section" id="ablations">
|
|
<div class="section-header">
|
|
<div class="section-eyebrow">04 / Ablations</div>
|
|
<div>
|
|
<h2>The controls are doing real work.</h2>
|
|
<p class="section-lede">
|
|
The paper isolates the optimizer components that keep skill learning stable:
|
|
enough evidence, bounded textual updates, rejected-edit feedback, slow
|
|
update, and optimizer-side memory.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="ablation-layout">
|
|
<div class="table-wrap">
|
|
<table aria-label="Component ablations">
|
|
<!-- Column headers; scope="col" ties each header to the data cells beneath it. -->
<thead>
  <tr>
    <th scope="col">Component</th>
    <th scope="col">Setting</th>
    <th class="num" scope="col">SearchQA</th>
    <th class="num" scope="col">Spreadsheet</th>
    <th class="num" scope="col">LiveMath</th>
  </tr>
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td>Learning rate</td>
|
|
<td>lr=4 default</td>
|
|
<td class="num"><strong>87.1</strong></td>
|
|
<td class="num"><strong>77.5</strong></td>
|
|
<td class="num"><strong>61.3</strong></td>
|
|
</tr>
|
|
<tr>
|
|
<td>Learning rate</td>
|
|
<td>without lr</td>
|
|
<td class="num">84.6</td>
|
|
<td class="num">75.7</td>
|
|
<td class="num">57.3</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Rejected buffer</td>
|
|
<td>with buffer</td>
|
|
<td class="num"><strong>87.1</strong></td>
|
|
<td class="num"><strong>77.5</strong></td>
|
|
<td class="num"><strong>61.3</strong></td>
|
|
</tr>
|
|
<tr>
|
|
<td>Rejected buffer</td>
|
|
<td>without buffer</td>
|
|
<td class="num">85.5</td>
|
|
<td class="num">72.9</td>
|
|
<td class="num">58.9</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Update memory</td>
|
|
<td>meta skill + slow update</td>
|
|
<td class="num"><strong>87.1</strong></td>
|
|
<td class="num"><strong>77.5</strong></td>
|
|
<td class="num"><strong>61.3</strong></td>
|
|
</tr>
|
|
<tr>
|
|
<td>Update memory</td>
|
|
<td>without both</td>
|
|
<td class="num">86.3</td>
|
|
<td class="num">55.0</td>
|
|
<td class="num">59.7</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</div>
|
|
|
|
<article class="panel ablation-summary">
|
|
<h3>What the ablations say</h3>
|
|
<div class="mini-list">
|
|
<div class="mini-item">
|
|
<b>Bounded</b>
|
|
<span>Textual learning rates prevent destructive rewrites while keeping enough plasticity to learn new procedures.</span>
|
|
</div>
|
|
<div class="mini-item">
|
|
<b>Gated</b>
|
|
<span>Held-out selection turns reflection into propose-and-test optimization rather than unconditional self-editing.</span>
|
|
</div>
|
|
<div class="mini-item">
|
|
<b>Buffered</b>
|
|
<span>Rejected edits become negative feedback, helping the optimizer avoid repeating harmful directions.</span>
|
|
</div>
|
|
</div>
|
|
</article>
|
|
</div>
|
|
|
|
<!-- Epoch trend plot. It sits far down the page, so it is fetched lazily and
     decoded off the main thread. -->
<figure class="figure-frame">
  <img src="skillopt-assets/epoch-trends-1.png" alt="Epoch checkpoint trends for SpreadsheetBench, SearchQA, and LiveMath." loading="lazy" decoding="async">
  <figcaption class="caption">
    Epoch checkpoint trends from the paper. Selection-best checkpoints are compared with train rollout score and unseen test performance.
  </figcaption>
</figure>
|
|
</section>
|
|
|
|
<section class="section" id="evolution">
|
|
<div class="section-header">
|
|
<div class="section-eyebrow">05 / Skill Evolution</div>
|
|
<div>
|
|
<h2>A typical run turns failures into concrete operating rules.</h2>
|
|
<p class="section-lede">
|
|
This ALFWorld run uses GPT-5.4-mini as the frozen target model and
|
|
GPT-5.5 as the optimizer model. The plot tracks train rollout and
|
|
held-out selection scores; hover or focus a point to inspect the
|
|
skill edit proposed at that stage.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="evolution-shell">
|
|
<article class="evolution-chart" aria-label="ALFWorld skill evolution chart">
|
|
<div class="chart-toolbar">
|
|
<span>ALFWorld / train-sel evolution</span>
|
|
<div class="chart-legend" aria-label="Chart legend">
|
|
<span class="legend-item" style="--legend: var(--teal)">Train rollout</span>
|
|
<span class="legend-item" style="--legend: var(--blue)">Selection gate</span>
|
|
</div>
|
|
</div>
|
|
<div class="chart-scroller">
|
|
<!-- Interactive score chart. The container uses role="group" rather than
     role="img": an img role treats its descendants as presentational, which
     would hide the focusable button points below from assistive technology.
     The title supplies the name and the desc the longer description. -->
<svg class="skill-chart" viewBox="0 0 790 340" role="group" aria-labelledby="evolution-chart-title" aria-describedby="evolution-chart-desc">
  <title id="evolution-chart-title">ALFWorld skill evolution scores</title>
  <desc id="evolution-chart-desc">Selection score rises from 68.6 percent to 81.4 percent, while rejected edits are visible as downward candidate points.</desc>

  <!-- Horizontal grid lines, one per 5-point tick from 85% (y=60) to 65% (y=280). -->
  <line class="chart-grid" x1="70" y1="60" x2="730" y2="60"></line>
  <line class="chart-grid" x1="70" y1="115" x2="730" y2="115"></line>
  <line class="chart-grid" x1="70" y1="170" x2="730" y2="170"></line>
  <line class="chart-grid" x1="70" y1="225" x2="730" y2="225"></line>
  <line class="chart-grid" x1="70" y1="280" x2="730" y2="280"></line>

  <!-- Axes -->
  <line class="chart-axis" x1="70" y1="280" x2="730" y2="280"></line>
  <line class="chart-axis" x1="70" y1="60" x2="70" y2="280"></line>

  <!-- Y-axis tick labels -->
  <text class="chart-label" x="25" y="64">85%</text>
  <text class="chart-label" x="25" y="119">80%</text>
  <text class="chart-label" x="25" y="174">75%</text>
  <text class="chart-label" x="25" y="229">70%</text>
  <text class="chart-label" x="25" y="284">65%</text>

  <!-- X-axis stage labels -->
  <text class="chart-label" x="50" y="318">base</text>
  <text class="chart-label" x="181" y="318">step 1</text>
  <text class="chart-label" x="311" y="318">step 2</text>
  <text class="chart-label" x="441" y="318">step 3</text>
  <text class="chart-label" x="563" y="318">slow</text>
  <text class="chart-label" x="701" y="318">step 4</text>

  <!-- Score series: the selection line has a baseline point, the train line starts at step 1. -->
  <polyline class="line-selection" points="70,240.7 200,201.4 330,162.1 460,232.9 590,99.3 720,146.4"></polyline>
  <polyline class="line-train" points="200,238.8 330,156.3 460,142.5 590,115 720,87.5"></polyline>

  <!-- Focusable points. In each group the r=12 "hit" circle and the r=5 circle
       share the selection coordinates; the r=4 circle, where present, sits on
       the train line. -->
  <g class="chart-point" data-index="0" data-state="baseline" tabindex="0" role="button" aria-label="Baseline selection score 68.6 percent">
    <circle class="hit" cx="70" cy="240.7" r="12"></circle>
    <circle cx="70" cy="240.7" r="5"></circle>
  </g>
  <g class="chart-point" data-index="1" data-state="accepted" tabindex="0" role="button" aria-label="Step 1 accepted, selection score 72.1 percent">
    <circle class="hit" cx="200" cy="201.4" r="12"></circle>
    <circle cx="200" cy="201.4" r="5"></circle>
    <circle cx="200" cy="238.8" r="4"></circle>
  </g>
  <g class="chart-point" data-index="2" data-state="accepted" tabindex="0" role="button" aria-label="Step 2 accepted, selection score 75.7 percent">
    <circle class="hit" cx="330" cy="162.1" r="12"></circle>
    <circle cx="330" cy="162.1" r="5"></circle>
    <circle cx="330" cy="156.3" r="4"></circle>
  </g>
  <g class="chart-point" data-index="3" data-state="rejected" tabindex="0" role="button" aria-label="Step 3 rejected, candidate selection score 69.3 percent">
    <circle class="hit" cx="460" cy="232.9" r="12"></circle>
    <circle cx="460" cy="232.9" r="5"></circle>
    <circle cx="460" cy="142.5" r="4"></circle>
  </g>
  <g class="chart-point" data-index="4" data-state="slow" tabindex="0" role="button" aria-label="Slow update accepted, selection score 81.4 percent">
    <circle class="hit" cx="590" cy="99.3" r="12"></circle>
    <circle cx="590" cy="99.3" r="5"></circle>
    <circle cx="590" cy="115" r="4"></circle>
  </g>
  <g class="chart-point" data-index="5" data-state="rejected" tabindex="0" role="button" aria-label="Step 4 rejected, candidate selection score 77.1 percent">
    <circle class="hit" cx="720" cy="146.4" r="12"></circle>
    <circle cx="720" cy="146.4" r="5"></circle>
    <circle cx="720" cy="87.5" r="4"></circle>
  </g>
</svg>
|
|
</div>
|
|
<div class="chart-caption">
|
|
<span>Accepted edits become the current skill only after held-out selection improves.</span>
|
|
<span>Step 3 is rescued by a slow update; Step 4 trains higher but fails selection.</span>
|
|
</div>
|
|
</article>
|
|
|
|
<aside class="evolution-detail" aria-live="polite">
|
|
<div class="detail-kicker">
|
|
<span id="evo-step">Slow update</span>
|
|
<span class="detail-badge" id="evo-status">Accepted</span>
|
|
</div>
|
|
<h3 id="evo-title">Epoch 3 slow update</h3>
|
|
<div class="detail-metrics">
|
|
<div class="detail-metric">
|
|
<span>Train rollout</span>
|
|
<b id="evo-train">80.0%</b>
|
|
</div>
|
|
<div class="detail-metric">
|
|
<span>Selection gate</span>
|
|
<b id="evo-selection">81.4%</b>
|
|
</div>
|
|
</div>
|
|
<p class="detail-summary" id="evo-summary">
|
|
Longitudinal comparison found no regressions and three improvements, so a broader search-memory update became the new best skill.
|
|
</p>
|
|
<ul class="detail-edits" id="evo-edits">
|
|
<li>Count any generic target receptacle instance as valid.</li>
|
|
<li>Keep a strict numbered searched set and do not re-check observed locations.</li>
|
|
<li>Broaden search after several misses in one location type.</li>
|
|
</ul>
|
|
</aside>
|
|
</div>
|
|
|
|
<div class="evolution-footnotes">
|
|
<div class="evolution-note">
|
|
<b>Run setup</b>
|
|
Target model: GPT-5.4-mini. Optimizer model: GPT-5.5. The skill starts from a compact ALFWorld instruction file and is edited in text space.
|
|
</div>
|
|
<div class="evolution-note">
|
|
<b>Selection rule</b>
|
|
Candidate edits are accepted only when held-out selection improves the current best score.
|
|
</div>
|
|
<div class="evolution-note">
|
|
<b>Outcome</b>
|
|
The selected skill improves final ALFWorld test hard score from 70.9% to 85.8%.
|
|
</div>
|
|
</div>
|
|
</section>
|
|
|
|
<section class="section" id="transfer">
|
|
<div class="section-header">
|
|
<div class="section-eyebrow">06 / Transfer</div>
|
|
<div>
|
|
<h2>The exported skill behaves like a reusable artifact.</h2>
|
|
<p class="section-lede">
|
|
SkillOpt exports a compact <code>best_skill.md</code>. The paper tests
|
|
whether that artifact transfers across model sizes, execution harnesses,
|
|
and nearby benchmarks without further target-side optimization.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="transfer-grid">
|
|
<article class="transfer">
|
|
<span>Cross-model</span>
|
|
<span class="big">+15.2</span>
|
|
<p>GPT-5.4 LiveMath skill transferred to GPT-5.4-nano on LiveMathBench.</p>
|
|
</article>
|
|
<article class="transfer">
|
|
<span>Cross-harness</span>
|
|
<span class="big">+31.8</span>
|
|
<p>Codex-trained SpreadsheetBench skill transferred into Claude Code.</p>
|
|
</article>
|
|
<article class="transfer">
|
|
<span>Self-optimizer</span>
|
|
<span class="big">+10.4</span>
|
|
<p>GPT-5.4-nano used as its own optimizer improved SpreadsheetBench over baseline.</p>
|
|
</article>
|
|
<article class="transfer">
|
|
<span>Deployment</span>
|
|
<span class="big">1 file</span>
|
|
<p>The target model consumes only the final skill, not optimizer memory.</p>
|
|
</article>
|
|
</div>
|
|
|
|
<div class="callout">
|
|
A stronger optimizer model gives the largest gains, but the loop is not merely
|
|
distillation from a stronger model. Even matched target-as-optimizer settings
|
|
can discover useful edits when the update is constrained, buffered, and
|
|
validated.
|
|
</div>
|
|
</section>
|
|
|
|
<section class="section" id="citation">
|
|
<div class="section-header">
|
|
<div class="section-eyebrow">07 / BibTeX</div>
|
|
<div>
|
|
<h2>Citation.</h2>
|
|
<p class="section-lede">
|
|
If you find SkillOpt useful, please cite the arXiv preprint below.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="bibtex-box">
|
|
<button class="copy-btn" type="button" onclick="copyBibtex(this)">Copy</button>
|
|
<pre><code>@article{skillopt2026,
|
|
title = {SkillOpt: Executive Strategy for Self-Evolving Agent Skills},
|
|
author = {SkillOpt Authors},
|
|
year = {2026},
|
|
eprint = {2605.23904},
|
|
archivePrefix = {arXiv},
|
|
primaryClass = {cs.LG},
|
|
url = {https://arxiv.org/abs/2605.23904}
|
|
}</code></pre>
|
|
</div>
|
|
</section>
|
|
|
|
<footer class="footer">
|
|
<span>SkillOpt: Executive Strategy for Self-Evolving Agent Skills</span>
|
|
<span><a href="https://github.com/microsoft/SkillOpt" target="_blank" rel="noopener">Code</a> / <a href="#citation">Citation</a></span>
|
|
</footer>
|
|
</main>
|
|
<script>
|
|
// Page chrome: toggles the navbar's "scrolled" state and reveals content
// blocks the first time they scroll into view.
(function () {
  const navbar = document.getElementById("navbar");

  // Only wire up scroll handling when the navbar exists; otherwise the
  // listener would run on every scroll event and do nothing.
  if (navbar) {
    const updateNavbar = () => {
      // "scrolled" is set once the page has moved more than 40px from the top.
      navbar.classList.toggle("scrolled", window.scrollY > 40);
    };
    updateNavbar();
    window.addEventListener("scroll", updateNavbar, { passive: true });
  }

  // Every element matching one of these selectors takes part in the reveal effect.
  const revealSelector = [
    ".teaser-showcase",
    ".section-header",
    ".manifesto",
    ".method-grid",
    ".figure-frame",
    ".table-wrap",
    ".comparison-frame",
    ".ablation-layout",
    ".evolution-shell",
    ".evolution-footnotes",
    ".transfer-grid",
    ".callout",
    ".bibtex-box"
  ].join(",");

  const revealNodes = Array.from(document.querySelectorAll(revealSelector));
  revealNodes.forEach((node) => node.classList.add("reveal"));

  if ("IntersectionObserver" in window) {
    const observer = new IntersectionObserver((entries, self) => {
      entries.forEach((entry) => {
        if (!entry.isIntersecting) return;
        entry.target.classList.add("visible");
        // The class is never removed, so stop watching a node once it is
        // revealed instead of receiving callbacks for it indefinitely.
        self.unobserve(entry.target);
      });
    }, { threshold: 0, rootMargin: "0px 0px -8% 0px" });

    revealNodes.forEach((node) => observer.observe(node));
  } else {
    // Without IntersectionObserver, show everything immediately.
    revealNodes.forEach((node) => node.classList.add("visible"));
  }
})();
|
|
|
|
/**
 * Copy the text of the <code> element inside the clicked button's
 * ".bibtex-box" ancestor to the clipboard, then show "Copied!" on the
 * button for two seconds.
 *
 * Uses the async Clipboard API when available. That API is only exposed in
 * secure contexts, so a hidden-textarea + execCommand("copy") fallback is
 * used when it is missing or when the write is rejected.
 *
 * @param {HTMLElement} btn The copy button that was clicked.
 */
function copyBibtex(btn) {
  const box = btn.closest(".bibtex-box");
  const code = box ? box.querySelector("code") : null;
  const text = code ? code.textContent : "";
  if (!text) return; // nothing to copy; do not claim success

  // Temporary success feedback on the button.
  const showCopied = () => {
    btn.textContent = "Copied!";
    btn.classList.add("copied");
    setTimeout(() => {
      btn.textContent = "Copy";
      btn.classList.remove("copied");
    }, 2000);
  };

  // Legacy path: select the text in an off-screen textarea and ask the
  // browser to copy the selection. Returns true when the copy succeeded.
  const legacyCopy = () => {
    const area = document.createElement("textarea");
    area.value = text;
    area.setAttribute("readonly", "");
    area.style.position = "fixed";
    area.style.opacity = "0";
    document.body.appendChild(area);
    area.select();
    let ok = false;
    try {
      ok = document.execCommand("copy");
    } catch (err) {
      ok = false;
    }
    document.body.removeChild(area);
    btn.focus(); // selecting the textarea moved focus away from the button
    return ok;
  };

  const fallback = () => {
    if (legacyCopy()) showCopied();
  };

  if (navigator.clipboard && typeof navigator.clipboard.writeText === "function") {
    navigator.clipboard.writeText(text).then(showCopied, fallback);
  } else {
    fallback();
  }
}
|
|
|
|
// Data for the method-comparison bar charts.
//   methods: the compared methods, in the order they are listed/drawn;
//            `key` is the property name used inside each bench's `values`.
//   benches: one entry per benchmark with its y-axis range (yMin..yMax)
//            and one score per method key.
const methodComparison = {
  methods: [
    { key: "NoSkill", label: "No skill", color: "#94a3b8" },
    { key: "Human", label: "Human skill", color: "#7c3aed" },
    { key: "LLM", label: "LLM skill", color: "#4f46e5" },
    { key: "Trace", label: "Trace2Skill", color: "#0ea5e9" },
    { key: "TextGrad", label: "TextGrad", color: "#ec4899" },
    { key: "GEPA", label: "GEPA", color: "#f59e0b" },
    { key: "Ours", label: "SkillOpt", color: "#16a34a" }
  ],
  benches: [
    {
      name: "SearchQA",
      yMin: 65, yMax: 85,
      values: {
        NoSkill: 71.3, Human: 74.5, LLM: 72.9,
        Trace: 76.5, TextGrad: 76.4, GEPA: 78.1,
        Ours: 80.0
      }
    },
    {
      name: "SpreadsheetBench",
      yMin: 25, yMax: 55,
      values: {
        NoSkill: 32.6, Human: 46.5, LLM: 35.9,
        Trace: 39.3, TextGrad: 31.3, GEPA: 47.3,
        Ours: 51.7
      }
    },
    {
      name: "OfficeQA",
      yMin: 25, yMax: 55,
      values: {
        NoSkill: 31.0, Human: 48.3, LLM: 29.0,
        Trace: 35.1, TextGrad: 38.7, GEPA: 47.8,
        Ours: 52.4
      }
    },
    {
      name: "DocVQA",
      yMin: 70, yMax: 92,
      values: {
        NoSkill: 72.3, Human: 86.0, LLM: 86.0,
        Trace: 87.4, TextGrad: 83.2, GEPA: 85.0,
        Ours: 89.1
      }
    },
    {
      name: "LiveMath",
      yMin: 20, yMax: 45,
      values: {
        NoSkill: 26.7, Human: 28.1, LLM: 29.1,
        Trace: 33.7, TextGrad: 24.9, GEPA: 32.2,
        Ours: 42.9
      }
    },
    {
      name: "ALFWorld",
      yMin: 50, yMax: 87,
      values: {
        NoSkill: 60.8, Human: 54.7, LLM: 70.4,
        Trace: 73.4, TextGrad: 67.5, GEPA: 75.4,
        Ours: 84.3
      }
    }
  ]
};
|
|
|
|
/**
 * Render the method-comparison widget from `methodComparison`:
 *   - one ".legend-chip" per method into #method-comparison-legend;
 *   - one ".benchmark-panel" per benchmark into #method-comparison-grid,
 *     holding a title, a pill with SkillOpt's margin over the best
 *     other method, and one bar per method.
 *
 * Does nothing when either container element is missing. Nodes are
 * appended, so calling this more than once duplicates the output.
 */
function renderMethodComparison() {
  const grid = document.getElementById("method-comparison-grid");
  const legend = document.getElementById("method-comparison-legend");
  if (!grid || !legend) return;

  const methods = methodComparison.methods;

  // Legend: one chip per method; the colour is passed as a CSS custom property.
  methods.forEach((method) => {
    const chip = document.createElement("span");
    chip.className = "legend-chip";
    chip.style.setProperty("--color", method.color);
    chip.textContent = method.label;
    legend.appendChild(chip);
  });

  methodComparison.benches.forEach((bench) => {
    const panel = document.createElement("article");
    panel.className = "benchmark-panel";

    const top = document.createElement("div");
    top.className = "benchmark-top";

    const title = document.createElement("h4");
    title.textContent = bench.name;

    // Margin of SkillOpt ("Ours") over the strongest of the other methods.
    const baselineScores = methods
      .filter((method) => method.key !== "Ours")
      .map((method) => bench.values[method.key]);
    const delta = bench.values.Ours - Math.max(...baselineScores);

    // Round first, then choose the sign, so a deficit reads "-1.2" rather
    // than "+-1.2" and a tiny negative value does not become "-0.0".
    const rounded = Number(delta.toFixed(1));
    const sign = rounded < 0 ? "-" : "+";

    const deltaPill = document.createElement("span");
    deltaPill.className = "delta-pill";
    deltaPill.textContent = `SkillOpt ${sign}${Math.abs(rounded).toFixed(1)}`;

    top.appendChild(title);
    top.appendChild(deltaPill);

    const stage = document.createElement("div");
    stage.className = "bar-stage";

    const axis = document.createElement("span");
    axis.className = "axis-range";
    axis.textContent = `y ${bench.yMin}-${bench.yMax}`;
    stage.appendChild(axis);

    const range = bench.yMax - bench.yMin;

    methods.forEach((method) => {
      const value = bench.values[method.key];

      // Bar height as a percentage of the axis range, kept within 4..100 so
      // a low score still shows a sliver and a score above yMax cannot
      // overflow the stage. A degenerate range falls back to the minimum.
      const rawHeight = range > 0 ? ((value - bench.yMin) / range) * 100 : 0;
      const height = Math.min(100, Math.max(4, rawHeight));

      const bar = document.createElement("div");
      bar.className = method.key === "Ours" ? "method-bar skillopt" : "method-bar";
      bar.style.setProperty("--h", `${height}%`);
      bar.style.setProperty("--color", method.color);
      bar.title = `${method.label}: ${value.toFixed(1)}`;
      // aria-label is not reliably exposed on a role-less <div>; an explicit
      // image role makes the label the bar's accessible name.
      bar.setAttribute("role", "img");
      bar.setAttribute("aria-label", `${bench.name} ${method.label} score ${value.toFixed(1)}`);

      // Only the SkillOpt bar carries a visible numeric label.
      if (method.key === "Ours") {
        const valueLabel = document.createElement("span");
        valueLabel.textContent = value.toFixed(1);
        bar.appendChild(valueLabel);
      }

      stage.appendChild(bar);
    });

    panel.appendChild(top);
    panel.appendChild(stage);
    grid.appendChild(panel);
  });
}

renderMethodComparison();
|
|
|
|
// Content for the skill-evolution detail panel, one entry per chart point.
// Each entry carries a step label, an acceptance status, a title, the train
// and selection scores as display strings, a summary, and a list of edits.
const evolutionSteps = [
  // 0
  {
    step: "Baseline", status: "Initial",
    title: "Initial ALFWorld skill",
    train: "-", selection: "68.6%",
    summary: "The starting skill solves many direct cases, but failures cluster around repeated search, loose object matching, and unfinished pick-two progress.",
    edits: [
      "Generic search and delivery rules, with no persistent numbered-location memory.",
      "Selection baseline before any optimizer edit is applied.",
      "The run uses this score as the acceptance floor for future candidates."
    ]
  },
  // 1
  {
    step: "Step 1", status: "Accepted",
    title: "Search memory and exact targets",
    train: "68.8%", selection: "72.1%",
    summary: "The first accepted edit fixes recurring navigation loops and makes object matching stricter.",
    edits: [
      "Add a persistent checklist for observed receptacles, surfaces, containers, and appliances.",
      "Use semantic search priors, then broaden without revisiting checked locations.",
      "Require exact object nouns; do not treat similar items as substitutes."
    ]
  },
  // 2
  {
    step: "Step 2", status: "Accepted",
    title: "Delivery, transforms, and pick-two bookkeeping",
    train: "76.3%", selection: "75.7%",
    summary: "The second accepted edit turns several procedural hints into executable rules for ALFWorld task types.",
    edits: [
      "Open the target receptacle if needed and place the held goal object directly.",
      "Treat clean, heat, and cool adjectives as mandatory transformations.",
      "For pick-two tasks, place one object, count progress, then fetch only the remaining instance."
    ]
  },
  // 3
  {
    step: "Step 3", status: "Rejected",
    title: "Candidate overfits search bookkeeping",
    train: "77.5%", selection: "69.3%",
    summary: "The local train rollout improves, but held-out selection drops below the current best, so the candidate is rejected.",
    edits: [
      "Proposed a more executable search ledger in every search-oriented thought.",
      "The gate prevents this narrower rewrite from replacing the stronger Step 2 skill.",
      "Rejected-edit feedback is kept as negative evidence for future updates."
    ]
  },
  // 4
  {
    step: "Slow update", status: "Accepted",
    title: "Epoch 3 slow update",
    train: "80.0%", selection: "81.4%",
    summary: "Longitudinal comparison finds no regressions and three improvements, so a broader search-memory update becomes the new best skill.",
    edits: [
      "Count any generic target receptacle instance as valid.",
      "Keep a strict numbered searched set and do not re-check observed locations.",
      "Broaden search after several misses in one location type."
    ]
  },
  // 5
  {
    step: "Step 4", status: "Rejected",
    title: "Higher train score, lower selection score",
    train: "82.5%", selection: "77.1%",
    summary: "The final candidate looks better on the training batch but fails to beat the slow-update checkpoint on selection.",
    edits: [
      "Tried to make numbered-location memory even more explicit.",
      "Added stronger failed-route marking after repeated impossible moves.",
      "Selection rejects it, preserving the 81.4% slow-update best skill."
    ]
  }
];
|
|
|
|
// All elements with class "chart-point" on the page.
const pointNodes = document.querySelectorAll(".chart-point");

// Detail-panel elements, keyed by the evolutionSteps property each one shows.
// A value is null when no element with that id exists in the page.
const detailFields = {};
[
  ["step", "evo-step"],
  ["status", "evo-status"],
  ["title", "evo-title"],
  ["train", "evo-train"],
  ["selection", "evo-selection"],
  ["summary", "evo-summary"],
  ["edits", "evo-edits"]
].forEach(([field, elementId]) => {
  detailFields[field] = document.getElementById(elementId);
});
|
|
|
|
/**
 * Show the evolutionSteps entry at `index` in the detail panel and mark the
 * matching chart point as active.
 *
 * Does nothing when `index` has no entry or when the panel's title element
 * is absent. Any other panel element that is missing is skipped instead of
 * throwing, so a partially present panel still updates.
 *
 * @param {number} index Position in `evolutionSteps`; compared against each
 *   chart point's numeric `data-index`.
 */
function showEvolutionStep(index) {
  const item = evolutionSteps[index];
  if (!item || !detailFields.title) return;

  // Plain-text fields: copy the entry's value into the element of the same name.
  ["step", "status", "title", "train", "selection", "summary"].forEach((name) => {
    const target = detailFields[name];
    if (target) target.textContent = item[name];
  });

  // Edits list: clear the previous items, then add one <li> per edit.
  const editsList = detailFields.edits;
  if (editsList) {
    editsList.textContent = "";
    const items = document.createDocumentFragment();
    item.edits.forEach((edit) => {
      const li = document.createElement("li");
      li.textContent = edit;
      items.appendChild(li);
    });
    editsList.appendChild(items);
  }

  // Exactly the point(s) whose data-index equals `index` get "is-active".
  pointNodes.forEach((node) => {
    node.classList.toggle("is-active", Number(node.dataset.index) === index);
  });
}
|
|
|
|
// Hovering, focusing, or clicking a chart point shows the step whose
// position is given by the point's data-index attribute.
pointNodes.forEach((point) => {
  const stepIndex = Number(point.dataset.index);
  const activate = () => showEvolutionStep(stepIndex);
  ["mouseenter", "focus", "click"].forEach((eventName) => {
    point.addEventListener(eventName, activate);
  });
});

// Initial selection: index 4, the "Slow update" entry of evolutionSteps.
showEvolutionStep(4);
|
|
</script>
|
|
</body>
|
|
</html>
|