Skip to content

Commit 7597a02

Browse files
EbonsignoriheiskrCopilot
authored
Cross-version deduplication for audit logs and GitHub Apps data (#61361)
Co-authored-by: Kevin Heis <heiskr@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 6c65ed7 commit 7597a02

13 files changed

Lines changed: 754 additions & 19 deletions

File tree

src/audit-logs/data/shared/entries.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

src/audit-logs/data/shared/fields-pool.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

src/audit-logs/data/version-index.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

src/audit-logs/lib/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
"git": "Note: Git events have special access requirements and retention policies that differ from other audit log events. For GitHub Enterprise Cloud, access Git events via the REST API only with 7-day retention. For GitHub Enterprise Server, Git events must be enabled in audit log configuration and are not included in search results.",
1010
"sso_redirect": "Note: Automatically redirecting users to sign in is currently in beta for Enterprise Managed Users and subject to change."
1111
},
12-
"sha": "961b64fe9080f83cc3f408a8dd1e30fe2eafaee1"
12+
"sha": "ad0dc7ebe4a70afe77bb03487060481fe7a9f13a"
1313
}

src/audit-logs/lib/index.ts

Lines changed: 96 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import type {
1212
RawAuditLogEventT,
1313
CategoryNotes,
1414
AuditLogConfig,
15+
DeduplicatedAuditLogEntry,
16+
AuditLogVersionIndex,
1517
} from '../types'
1618
import config from './config.json'
1719

@@ -21,6 +23,86 @@ export const AUDIT_LOG_DATA_DIR = 'src/audit-logs/data'
2123
const auditLogEventsCache = new Map<string, Map<string, AuditLogEventT[]>>()
2224
const categorizedAuditLogEventsCache = new Map<string, Map<string, CategorizedEvents>>()
2325

26+
// Shared dedup data — loaded once, shared across all versions
// All of these are populated by loadSharedFormat() and read by
// reconstructEventsFromSharedFormat().
27+
// Pool of unique entries; positions stored in the version index point into this array.
let sharedEntries: DeduplicatedAuditLogEntry[] | null = null
28+
// Pool of unique `fields` lists; an entry's `fieldsIndex` points into this array.
let sharedFieldsPool: string[][] | null = null
29+
// Lookup of version → page → list of positions in `sharedEntries`.
let sharedVersionIndex: AuditLogVersionIndex | null = null
30+
// Memoized outcome of loadSharedFormat(): true = loaded, false = files missing.
let sharedFormatAvailable: boolean | null = null // null = not checked yet
31+
32+
/**
 * Reports whether `err` is a "path does not exist" filesystem error.
 *
 * A missing shared-format file is expected (per-version files are the fallback),
 * but a corrupt or unparseable file should fail loudly rather than silently
 * degrade to the per-version files and hide bad generated data. This check is
 * what tells those two situations apart.
 *
 * @param err - The value caught from a failed read; may be anything.
 * @returns `true` only for `Error` instances whose `code` is `ENOENT` or
 *   `ENOTDIR`; `false` for every other value.
 */
function isFileNotFoundError(err: unknown): boolean {
  // The `in` check narrows `err`, so `code` can be read without a type assertion.
  if (!(err instanceof Error) || !('code' in err)) return false
  return err.code === 'ENOENT' || err.code === 'ENOTDIR'
}
40+
41+
/**
 * Loads the shared (cross-version deduplicated) audit log data, at most once
 * per successful or "files missing" outcome.
 *
 * The three files are read into locals, shape-checked, and only then committed
 * to module state, so a failure part-way through never leaves the module with
 * a half-populated pool.
 *
 * @returns `true` when the shared format was loaded, `false` when any shared
 *   file does not exist (callers then fall back to per-version files).
 * @throws Rethrows any error that is not a file-not-found error (unparseable
 *   JSON, unexpected top-level shape, etc.) after logging it. The outcome is
 *   not memoized in that case, so a later call tries again.
 */
function loadSharedFormat(): boolean {
  if (sharedFormatAvailable !== null) return sharedFormatAvailable
  try {
    const entries: unknown = readCompressedJsonFileFallback(
      path.join(AUDIT_LOG_DATA_DIR, 'shared', 'entries.json'),
    )
    const fieldsPool: unknown = readCompressedJsonFileFallback(
      path.join(AUDIT_LOG_DATA_DIR, 'shared', 'fields-pool.json'),
    )
    const versionIndex: unknown = readCompressedJsonFileFallback(
      path.join(AUDIT_LOG_DATA_DIR, 'version-index.json'),
    )

    // Validate the top-level shapes here so a schema mismatch is reported at
    // load time instead of as an opaque TypeError during reconstruction.
    if (!Array.isArray(entries)) {
      throw new TypeError('Audit log shared/entries.json must contain a JSON array')
    }
    if (!Array.isArray(fieldsPool) || !fieldsPool.every((fields) => Array.isArray(fields))) {
      throw new TypeError('Audit log shared/fields-pool.json must contain a JSON array of arrays')
    }
    if (typeof versionIndex !== 'object' || versionIndex === null || Array.isArray(versionIndex)) {
      throw new TypeError('Audit log version-index.json must contain a JSON object')
    }

    // Freeze pool data so reconstructed events (which return references into
    // these pools) can't be mutated by downstream code and leak across versions.
    Object.freeze(entries)
    Object.freeze(fieldsPool)
    for (const fields of fieldsPool) Object.freeze(fields)

    // Commit everything together now that all three files are known to be good.
    sharedEntries = entries as DeduplicatedAuditLogEntry[]
    sharedFieldsPool = fieldsPool as string[][]
    sharedVersionIndex = versionIndex as AuditLogVersionIndex
    sharedFormatAvailable = true
  } catch (err) {
    if (isFileNotFoundError(err)) {
      // Shared files don't exist — fall back to per-version files silently.
      sharedFormatAvailable = false
    } else {
      // Corrupt JSON, schema mismatch, etc. — surface this instead of hiding it.
      console.error('Failed to load shared audit log dedup format (corrupt data?):', err)
      throw err
    }
  }
  return sharedFormatAvailable
}
71+
72+
/**
 * Rebuilds the event list for one version/page from the shared dedup pools.
 *
 * @param version - Key used to look up the version in the version index.
 * @param page - Key used to look up the page within that version.
 * @returns The reconstructed events, or `null` when the shared format is not
 *   available or has no index for this version/page (callers then fall back
 *   to the per-version JSON file). Each event is a new object, but its
 *   `fields` array is a reference into the shared fields pool.
 * @throws RangeError when the version index or an entry points at a position
 *   that is not a valid integer index into the corresponding pool.
 */
function reconstructEventsFromSharedFormat(version: string, page: string): AuditLogEventT[] | null {
  if (!loadSharedFormat()) return null

  // Narrow the module-level pools once instead of asserting non-null at every use.
  const entriesPool = sharedEntries
  const fieldsPool = sharedFieldsPool
  if (!entriesPool || !fieldsPool) return null

  const indices = sharedVersionIndex?.[version]?.[page]
  if (!indices) return null

  return indices.map((idx) => {
    // Number.isInteger also rejects NaN, fractions and non-numbers, which a
    // plain `<` / `>=` comparison would let through.
    if (!Number.isInteger(idx) || idx < 0 || idx >= entriesPool.length) {
      throw new RangeError(
        `Audit log version-index references entry ${idx} for ${version}/${page}, ` +
          `but the entries pool only has ${entriesPool.length} entries. ` +
          `The shared dedup data may be stale or corrupt.`,
      )
    }
    const entry = entriesPool[idx]
    const event: AuditLogEventT = {
      action: entry.action,
      description: entry.description,
    }
    if (entry.docs_reference_links) event.docs_reference_links = entry.docs_reference_links
    if (entry.docs_reference_titles) event.docs_reference_titles = entry.docs_reference_titles
    if (entry.fieldsIndex !== undefined) {
      if (
        !Number.isInteger(entry.fieldsIndex) ||
        entry.fieldsIndex < 0 ||
        entry.fieldsIndex >= fieldsPool.length
      ) {
        throw new RangeError(
          `Audit log entry references fields index ${entry.fieldsIndex} for ${version}/${page}, ` +
            `but the fields pool only has ${fieldsPool.length} entries. ` +
            `The shared dedup data may be stale or corrupt.`,
        )
      }
      event.fields = fieldsPool[entry.fieldsIndex]
    }
    return event
  })
}
105+
24106
type PipelineConfig = {
25107
sha: string
26108
appendedDescriptions: Record<string, string>
@@ -169,21 +251,24 @@ async function resolveReferenceLinksToTitles(
169251
// ]
170252
export function getAuditLogEvents(page: string, version: string): AuditLogEventT[] {
171253
const openApiVersion = getOpenApiVersion(version)
172-
const auditLogFileName = path.join(AUDIT_LOG_DATA_DIR, openApiVersion, `${page}.json`)
173254

174255
// If the data isn't cached for an entire version or a particular page, read
175-
// the data from the JSON file the first time around
256+
// the data from the shared dedup format or fall back to per-version JSON files
176257
if (!auditLogEventsCache.has(openApiVersion)) {
177258
auditLogEventsCache.set(openApiVersion, new Map())
178-
auditLogEventsCache.get(openApiVersion)?.set(page, [])
179-
auditLogEventsCache
180-
.get(openApiVersion)
181-
?.set(page, readCompressedJsonFileFallback(auditLogFileName) as AuditLogEventT[])
182-
} else if (!auditLogEventsCache.get(openApiVersion)?.has(page)) {
183-
auditLogEventsCache.get(openApiVersion)?.set(page, [])
184-
auditLogEventsCache
185-
.get(openApiVersion)
186-
?.set(page, readCompressedJsonFileFallback(auditLogFileName) as AuditLogEventT[])
259+
}
260+
if (!auditLogEventsCache.get(openApiVersion)?.has(page)) {
261+
// Try shared deduplicated format first
262+
const events = reconstructEventsFromSharedFormat(openApiVersion, page)
263+
if (events) {
264+
auditLogEventsCache.get(openApiVersion)?.set(page, events)
265+
} else {
266+
// Fall back to per-version JSON file
267+
const auditLogFileName = path.join(AUDIT_LOG_DATA_DIR, openApiVersion, `${page}.json`)
268+
auditLogEventsCache
269+
.get(openApiVersion)
270+
?.set(page, readCompressedJsonFileFallback(auditLogFileName) as AuditLogEventT[])
271+
}
187272
}
188273

189274
const auditLogEvents = auditLogEventsCache.get(openApiVersion)?.get(page)

src/audit-logs/scripts/sync.ts

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,12 @@ import { getContents, getCommitSha } from '@/workflows/git-utils'
1717
import { latest, latestStable, releaseCandidate } from '@/versions/lib/enterprise-server-releases'
1818
import { loadPages, loadPageMap } from '@/frame/lib/page-data'
1919
import loadRedirects from '@/redirects/lib/precompile'
20-
import type { AuditLogEventT, VersionedAuditLogData } from '../types'
20+
import type {
21+
AuditLogEventT,
22+
VersionedAuditLogData,
23+
DeduplicatedAuditLogEntry,
24+
AuditLogVersionIndex,
25+
} from '../types'
2126

2227
if (!process.env.GITHUB_TOKEN) {
2328
throw new Error('GITHUB_TOKEN environment variable must be set to run this script')
@@ -209,6 +214,78 @@ async function main() {
209214
}
210215
}
211216
}
217+
218+
// Write deduplicated shared format
219+
await writeDeduplicatedFormat(auditLogData)
220+
}
221+
222+
/**
 * Writes the cross-version deduplicated form of the audit log data.
 *
 * Produces three files under the audit log data directory:
 * `shared/entries.json` (unique events, with `fields` replaced by a pool
 * position), `shared/fields-pool.json` (unique `fields` lists) and
 * `version-index.json` (version → page → positions in the entries pool).
 * A one-line summary of the dedup ratio is logged at the end.
 *
 * @param auditLogData - Events grouped by version, then by page.
 */
async function writeDeduplicatedFormat(auditLogData: VersionedAuditLogData) {
  console.log(`\n▶️ Writing deduplicated audit log data...\n`)

  // Unique `fields` lists, and a lookup from serialized list → pool position.
  const fieldsPool: string[][] = []
  const fieldsPositions = new Map<string, number>()

  // Returns the pool position for a fields list, adding it on first sight.
  // Missing or empty lists are not pooled.
  const internFields = (fields: string[] | undefined): number | undefined => {
    if (!fields || fields.length === 0) return undefined
    const serialized = JSON.stringify(fields)
    const known = fieldsPositions.get(serialized)
    if (known !== undefined) return known
    const position = fieldsPool.length
    fieldsPool.push(fields)
    fieldsPositions.set(serialized, position)
    return position
  }

  // Unique events, and a lookup from serialized entry → pool position.
  const entriesPool: DeduplicatedAuditLogEntry[] = []
  const entryPositions = new Map<string, number>()

  // Returns the pool position for an event, adding it on first sight.
  const internEntry = (event: AuditLogEventT): number => {
    const fieldsPosition = internFields(event.fields)
    // Key order here determines both the dedup key and the written JSON.
    const candidate: DeduplicatedAuditLogEntry = {
      action: event.action,
      description: event.description,
      ...(event.docs_reference_links ? { docs_reference_links: event.docs_reference_links } : {}),
      ...(event.docs_reference_titles
        ? { docs_reference_titles: event.docs_reference_titles }
        : {}),
      ...(fieldsPosition !== undefined ? { fieldsIndex: fieldsPosition } : {}),
    }

    const serialized = JSON.stringify(candidate)
    const known = entryPositions.get(serialized)
    if (known !== undefined) return known
    const position = entriesPool.length
    entriesPool.push(candidate)
    entryPositions.set(serialized, position)
    return position
  }

  // Walk every version/page in order, replacing each event with its pool position.
  const versionIndex: AuditLogVersionIndex = {}
  let totalEntries = 0

  for (const [version, pages] of Object.entries(auditLogData)) {
    const pagePositions: AuditLogVersionIndex[string] = {}
    versionIndex[version] = pagePositions
    for (const [page, events] of Object.entries(pages)) {
      pagePositions[page] = events.map((event) => internEntry(event))
      totalEntries += events.length
    }
  }

  // Make sure the shared directory exists before writing into it.
  const sharedDir = path.join(AUDIT_LOG_DATA_DIR, 'shared')
  if (!existsSync(sharedDir)) {
    await mkdirp(sharedDir)
  }

  // Written one after another, in this order.
  const outputs: Array<[string, unknown]> = [
    [path.join(sharedDir, 'entries.json'), entriesPool],
    [path.join(sharedDir, 'fields-pool.json'), fieldsPool],
    [path.join(AUDIT_LOG_DATA_DIR, 'version-index.json'), versionIndex],
  ]
  for (const [file, data] of outputs) {
    await writeFile(file, JSON.stringify(data))
  }

  const uniqueEntries = entriesPool.length
  const uniqueFields = fieldsPool.length
  let dedupRate = '0'
  if (totalEntries > 0) {
    dedupRate = ((1 - uniqueEntries / totalEntries) * 100).toFixed(1)
  }
  console.log(
    `✅ Deduplicated audit log data: ${totalEntries} total → ${uniqueEntries} unique entries (${dedupRate}% dedup), ${uniqueFields} unique field lists`,
  )
}
213290

214291
main()

0 commit comments

Comments
 (0)