fix(agent/codex): capture workDir under mutex in StartSession and SkillDirs (#1005)

StartSession at lines 333-352 acquires a.mu and reads seven fields
(mode, model, reasoningEffort, backend, appServerURL, codexHome,
cliBin, cliExtraArgs, provider config) — but a.workDir is read AFTER
a.mu.Unlock() at the two return sites (lines 364 and 370). SkillDirs
reads both a.workDir and a.codexHome without holding the lock.

SetWorkDir mutates a.workDir under a.mu, so /workspace landing on one
goroutine while a turn starts or a skill lookup runs on another races
on the string field. Strings carry a (ptr, len) pair; a torn read of a
new value of a different length can yield a frankenstring with the
pointer of one and the length of another, leading to memory-safety
failures.

Capture workDir (and codexHome in SkillDirs) inside the locked
section. Matches the existing pattern in ListSessions /
GetSessionHistory / DeleteSession on the same agent.

A regression test under -race spawns concurrent SetWorkDir writers
and SkillDirs readers. With the production fix the race detector
stays quiet; without it the test trips "DATA RACE detected".
This commit is contained in:
Shawn
2026-05-18 21:39:11 +08:00
committed by GitHub
parent d85377b55d
commit fdc6a9bc74
2 changed files with 41 additions and 5 deletions

View File

@@ -339,6 +339,7 @@ func (a *Agent) StartSession(ctx context.Context, sessionID string) (core.AgentS
codexHome := a.codexHome
cliBin := a.cliBin
cliExtraArgs := a.cliExtraArgs
workDir := a.workDir
extraEnv := a.providerEnvLocked()
extraEnv = append(extraEnv, a.sessionEnv...)
var baseURL string
@@ -361,13 +362,13 @@ func (a *Agent) StartSession(ctx context.Context, sessionID string) (core.AgentS
}
if backend == "app_server" {
return newAppServerSession(ctx, appServerURL, a.workDir, model, reasoningEffort, mode, sessionID, baseURL, provName, extraEnv, codexHome)
return newAppServerSession(ctx, appServerURL, workDir, model, reasoningEffort, mode, sessionID, baseURL, provName, extraEnv, codexHome)
}
if codexHome != "" {
extraEnv = append(extraEnv, "CODEX_HOME="+codexHome)
}
return newCodexSession(ctx, cliBin, cliExtraArgs, a.workDir, model, reasoningEffort, mode, sessionID, baseURL, extraEnv, provName)
return newCodexSession(ctx, cliBin, cliExtraArgs, workDir, model, reasoningEffort, mode, sessionID, baseURL, extraEnv, provName)
}
func (a *Agent) ListSessions(_ context.Context) ([]core.AgentSessionInfo, error) {
@@ -438,11 +439,15 @@ func (a *Agent) WorkspaceAgentOptions() map[string]any {
// ── SkillProvider implementation ──────────────────────────────
func (a *Agent) SkillDirs() []string {
absDir, err := filepath.Abs(a.workDir)
a.mu.RLock()
workDir := a.workDir
codexHome := a.codexHome
a.mu.RUnlock()
absDir, err := filepath.Abs(workDir)
if err != nil {
absDir = a.workDir
absDir = workDir
}
return codexSkillDirs(absDir, a.codexHome)
return codexSkillDirs(absDir, codexHome)
}
// ── ContextCompressor implementation ──────────────────────────

View File

@@ -4,6 +4,7 @@ import (
"os"
"path/filepath"
"runtime"
"sync"
"testing"
)
@@ -88,3 +89,33 @@ func setTestHome(t *testing.T, home string) {
t.Setenv("HOMEPATH", "")
}
}
// TestSkillDirs_RaceFreeAgainstSetWorkDir pins the bug where SkillDirs
// read a.workDir and a.codexHome without holding a.mu, while
// SetWorkDir writes a.workDir under the lock. Run with -race to detect
// the data race; with the production fix the test stays clean.
func TestSkillDirs_RaceFreeAgainstSetWorkDir(t *testing.T) {
tmp := t.TempDir()
a := &Agent{workDir: tmp, codexHome: filepath.Join(tmp, "codex")}
var wg sync.WaitGroup
for i := 0; i < 30; i++ {
wg.Add(1)
go func(i int) {
defer wg.Done()
if i%2 == 0 {
a.SetWorkDir(filepath.Join(tmp, "a"))
} else {
a.SetWorkDir(filepath.Join(tmp, "b"))
}
}(i)
}
for i := 0; i < 30; i++ {
wg.Add(1)
go func() {
defer wg.Done()
_ = a.SkillDirs()
}()
}
wg.Wait()
}