perf(opencode): batch snapshot diffFull blob reads (#20752)

Co-authored-by: Nate Williams <50088025+natewill@users.noreply.github.com>
pull/20958/head
Kit Langton 2026-04-03 21:05:23 -04:00 committed by GitHub
parent 59ca4543d8
commit 288eb044cb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 270 additions and 23 deletions

View File

@ -437,6 +437,146 @@ export namespace Snapshot {
const diffFull = Effect.fnUntraced(function* (from: string, to: string) { const diffFull = Effect.fnUntraced(function* (from: string, to: string) {
return yield* locked( return yield* locked(
Effect.gen(function* () { Effect.gen(function* () {
// One changed file parsed from a tab-separated line of `numstat.text`, before any blob
// content has been read.
type Row = {
// Third tab-separated column of the numstat line.
file: string
// Looked up in the `status` map by file; "modified" when the map has no entry.
status: "added" | "deleted" | "modified"
// True when both numstat count columns are "-"; binary rows get empty before/after text.
binary: boolean
// Parsed numstat counts; 0 for binary rows or when the column is not a finite number.
additions: number
deletions: number
}
// One blob to request from git for a changed file.
type Ref = {
// Path of the changed file this blob belongs to (used as the result map key).
file: string
// Which half of the diff the blob fills: "before" reads from `from`, "after" from `to`.
side: "before" | "after"
// Object name handed to git, in `<snapshot>:<path>` form.
ref: string
}
// Per-file reader: yields `[before, after]` text for one row via `git show`.
// Binary rows yield two empty strings without running git; added rows only read the `to`
// side, deleted rows only read the `from` side, and every other row reads both sides
// concurrently.
const show = Effect.fnUntraced(function* (row: Row) {
  // Runs `git show <rev>:<path>` for this row's file and keeps only the text output.
  const blob = (rev: string) =>
    git([...cfg, ...args(["show", `${rev}:${row.file}`])]).pipe(Effect.map((out) => out.text))

  if (row.binary) return ["", ""]
  switch (row.status) {
    case "added":
      return ["", yield* blob(to)]
    case "deleted":
      return [yield* blob(from), ""]
    default:
      return yield* Effect.all([blob(from), blob(to)], { concurrency: 2 })
  }
})
// Batch reader: fetches every blob needed by `rows` through a single `git cat-file --batch`
// process instead of one `git show` per blob.
//
// Resolves to a map keyed by file path holding the decoded `before` / `after` text ("" for a
// side that was not requested or that git reported as missing). Resolves to `undefined` when
// the batch result cannot be used: non-zero exit, malformed output, or a failure raised while
// running the process. Every such case is logged so the fallback to per-file `git show` is
// visible.
const load = Effect.fnUntraced(
  function* (rows: Row[]) {
    // Binary rows need no content; added/deleted rows need one side; the rest need both.
    const refs = rows.flatMap((row) => {
      if (row.binary) return []
      if (row.status === "added")
        return [{ file: row.file, side: "after", ref: `${to}:${row.file}` } satisfies Ref]
      if (row.status === "deleted") {
        return [{ file: row.file, side: "before", ref: `${from}:${row.file}` } satisfies Ref]
      }
      return [
        { file: row.file, side: "before", ref: `${from}:${row.file}` } satisfies Ref,
        { file: row.file, side: "after", ref: `${to}:${row.file}` } satisfies Ref,
      ]
    })
    if (!refs.length) return new Map<string, { before: string; after: string }>()

    // One object name per stdin line; the parser below expects one response per ref, in order.
    const proc = ChildProcess.make("git", [...cfg, ...args(["cat-file", "--batch"])], {
      cwd: state.directory,
      extendEnv: true,
      stdin: Stream.make(new TextEncoder().encode(refs.map((item) => item.ref).join("\n") + "\n")),
    })
    const handle = yield* spawner.spawn(proc)
    // stdout stays as raw bytes because blob sizes in the headers are byte counts.
    const [out, err] = yield* Effect.all(
      [Stream.mkUint8Array(handle.stdout), Stream.mkString(Stream.decodeText(handle.stderr))],
      { concurrency: 2 },
    )
    const code = yield* handle.exitCode
    if (code !== 0) {
      log.info("git cat-file --batch failed during snapshot diff, falling back to per-file git show", {
        stderr: err,
        refs: refs.length,
      })
      return undefined
    }

    // Logs why the batch output was rejected and yields the "use the fallback" result.
    const fail = (msg: string, extra?: Record<string, string>) => {
      log.info(msg, { ...extra, refs: refs.length })
      return undefined
    }

    const map = new Map<string, { before: string; after: string }>()
    const dec = new TextDecoder()
    // Hoisted so the pattern is not rebuilt for every ref.
    const header = /^[0-9a-f]+ blob (\d+)$/
    let i = 0
    // Parse the default `git cat-file --batch` stream: one header line,
    // then exactly `size` bytes of blob content, then a trailing newline.
    for (const ref of refs) {
      const end = out.indexOf(10, i)
      if (end === -1) {
        return fail(
          "git cat-file --batch returned a truncated header during snapshot diff, falling back to per-file git show",
        )
      }
      // `subarray` is a view, so decoding does not copy the bytes first.
      const head = dec.decode(out.subarray(i, end))
      i = end + 1
      const hit = map.get(ref.file) ?? { before: "", after: "" }
      if (head.endsWith(" missing")) {
        // A missing object leaves that side as the empty string.
        map.set(ref.file, hit)
        continue
      }
      const match = head.match(header)
      if (!match) {
        return fail(
          "git cat-file --batch returned an unexpected header during snapshot diff, falling back to per-file git show",
          { head },
        )
      }
      const size = Number(match[1])
      // The byte right after the content must exist and be the newline terminator.
      if (!Number.isInteger(size) || size < 0 || i + size >= out.length || out[i + size] !== 10) {
        return fail(
          "git cat-file --batch returned truncated content during snapshot diff, falling back to per-file git show",
          { head },
        )
      }
      const text = dec.decode(out.subarray(i, i + size))
      if (ref.side === "before") hit.before = text
      if (ref.side === "after") hit.after = text
      map.set(ref.file, hit)
      i += size + 1
    }
    // Anything left over means the stream did not line up with the requested refs.
    if (i !== out.length) {
      return fail(
        "git cat-file --batch returned trailing data during snapshot diff, falling back to per-file git show",
      )
    }
    return map
  },
  Effect.scoped,
  // Failures while running the process also mean "use the fallback", but are logged like the
  // other fallback paths instead of disappearing silently.
  Effect.catch((error) => {
    log.info("git cat-file --batch could not be run during snapshot diff, falling back to per-file git show", {
      error: error instanceof Error ? error.message : String(error),
    })
    return Effect.succeed<Map<string, { before: string; after: string }> | undefined>(undefined)
  }),
)
const result: Snapshot.FileDiff[] = [] const result: Snapshot.FileDiff[] = []
const status = new Map<string, "added" | "deleted" | "modified">() const status = new Map<string, "added" | "deleted" | "modified">()
@ -459,30 +599,45 @@ export namespace Snapshot {
}, },
) )
for (const line of numstat.text.trim().split("\n")) { const rows = numstat.text
if (!line) continue .trim()
.split("\n")
.filter(Boolean)
.flatMap((line) => {
const [adds, dels, file] = line.split("\t") const [adds, dels, file] = line.split("\t")
if (!file) continue if (!file) return []
const binary = adds === "-" && dels === "-" const binary = adds === "-" && dels === "-"
const [before, after] = binary
? ["", ""]
: yield* Effect.all(
[
git([...cfg, ...args(["show", `${from}:${file}`])]).pipe(Effect.map((item) => item.text)),
git([...cfg, ...args(["show", `${to}:${file}`])]).pipe(Effect.map((item) => item.text)),
],
{ concurrency: 2 },
)
const additions = binary ? 0 : parseInt(adds) const additions = binary ? 0 : parseInt(adds)
const deletions = binary ? 0 : parseInt(dels) const deletions = binary ? 0 : parseInt(dels)
result.push({ return [
{
file, file,
before, status: status.get(file) ?? "modified",
after, binary,
additions: Number.isFinite(additions) ? additions : 0, additions: Number.isFinite(additions) ? additions : 0,
deletions: Number.isFinite(deletions) ? deletions : 0, deletions: Number.isFinite(deletions) ? deletions : 0,
status: status.get(file) ?? "modified", } satisfies Row,
]
}) })
const step = 100
// Keep batches bounded so a large diff does not buffer every blob at once.
for (let i = 0; i < rows.length; i += step) {
const run = rows.slice(i, i + step)
const text = yield* load(run)
for (const row of run) {
const hit = text?.get(row.file) ?? { before: "", after: "" }
const [before, after] = row.binary ? ["", ""] : text ? [hit.before, hit.after] : yield* show(row)
result.push({
file: row.file,
before,
after,
additions: row.additions,
deletions: row.deletions,
status: row.status,
})
}
} }
return result return result

View File

@ -982,6 +982,98 @@ test("diffFull with new file additions", async () => {
}) })
}) })
// Snapshots 60 modified, 60 deleted and 60 binary files, rewrites/adds/removes them, and
// checks that diffFull reports the exact before/after text (including multi-byte characters)
// and status for every file, with binary files reported as empty text and zero counts.
test("diffFull with a large interleaved mixed diff", async () => {
  await using tmp = await bootstrap()
  await Instance.provide({
    directory: tmp.path,
    fn: async () => {
      const ids = Array.from({ length: 60 }, (_, n) => n.toString().padStart(3, "0"))
      const modified = ids.map((id) => fwd(tmp.path, "mix", `${id}-mod.txt`))
      const deleted = ids.map((id) => fwd(tmp.path, "mix", `${id}-del.txt`))
      const added = ids.map((id) => fwd(tmp.path, "mix", `${id}-add.txt`))
      const binary = ids.map((id) => fwd(tmp.path, "mix", `${id}-bin.bin`))
      await $`mkdir -p ${tmp.path}/mix`.quiet()

      // Initial state: files that will later be modified, deleted, or rewritten as binary.
      await Promise.all([
        ...modified.map((path, n) => Filesystem.write(path, `before-${ids[n]}\n🙂\nline`)),
        ...deleted.map((path, n) => Filesystem.write(path, `gone-${ids[n]}\n你好`)),
        ...binary.map((path, n) => Filesystem.write(path, new Uint8Array([0, n, 255, n % 251]))),
      ])
      const start = await Snapshot.track()
      expect(start).toBeTruthy()

      // Second state: rewrite, add, change binary content, and remove.
      await Promise.all([
        ...modified.map((path, n) => Filesystem.write(path, `after-${ids[n]}\n🚀\nline`)),
        ...added.map((path, n) => Filesystem.write(path, `new-${ids[n]}\nこんにちは`)),
        ...binary.map((path, n) => Filesystem.write(path, new Uint8Array([9, n, 8, n % 251]))),
        ...deleted.map((path) => fs.rm(path)),
      ])
      const finish = await Snapshot.track()
      expect(finish).toBeTruthy()

      const diffs = await Snapshot.diffFull(start!, finish!)
      expect(diffs).toHaveLength(ids.length * 4)

      const byFile = new Map(diffs.map((diff) => [diff.file, diff]))
      // Looks up the diff entry for a file under `mix/` and requires it to exist.
      const pick = (name: string) => {
        const entry = byFile.get(fwd("mix", name))
        expect(entry).toBeDefined()
        return entry!
      }

      for (const id of ids) {
        const mod = pick(`${id}-mod.txt`)
        expect(mod.before).toBe(`before-${id}\n🙂\nline`)
        expect(mod.after).toBe(`after-${id}\n🚀\nline`)
        expect(mod.status).toBe("modified")

        const del = pick(`${id}-del.txt`)
        expect(del.before).toBe(`gone-${id}\n你好`)
        expect(del.after).toBe("")
        expect(del.status).toBe("deleted")

        const add = pick(`${id}-add.txt`)
        expect(add.before).toBe("")
        expect(add.after).toBe(`new-${id}\nこんにちは`)
        expect(add.status).toBe("added")

        const bin = pick(`${id}-bin.bin`)
        expect(bin.before).toBe("")
        expect(bin.after).toBe("")
        expect(bin.additions).toBe(0)
        expect(bin.deletions).toBe(0)
        expect(bin.status).toBe("modified")
      }
    },
  })
})
// Modifies 140 files between two snapshots and checks that diffFull lists them in the
// same zero-padded path order, i.e. that splitting the work into batches does not reorder
// the results.
test("diffFull preserves git diff order across batch boundaries", async () => {
  await using tmp = await bootstrap()
  await Instance.provide({
    directory: tmp.path,
    fn: async () => {
      const ids = Array.from({ length: 140 }, (_, n) => n.toString().padStart(3, "0"))
      const paths = ids.map((id) => `${tmp.path}/order/${id}.txt`)
      await $`mkdir -p ${tmp.path}/order`.quiet()

      await Promise.all(paths.map((path, n) => Filesystem.write(path, `before-${ids[n]}`)))
      const start = await Snapshot.track()
      expect(start).toBeTruthy()

      await Promise.all(paths.map((path, n) => Filesystem.write(path, `after-${ids[n]}`)))
      const finish = await Snapshot.track()
      expect(finish).toBeTruthy()

      const wanted = ids.map((id) => `order/${id}.txt`)
      const diffs = await Snapshot.diffFull(start!, finish!)
      const listed = diffs.map((diff) => diff.file)
      expect(listed).toEqual(wanted)
    },
  })
})
test("diffFull with file modifications", async () => { test("diffFull with file modifications", async () => {
await using tmp = await bootstrap() await using tmp = await bootstrap()
await Instance.provide({ await Instance.provide({