Skip to content

Commit 77f7203

Browse files
authored
Merge pull request #54 from dacharyc/DOCSP-49695
(DOCSP-49695): Handle moved pages
2 parents 8e863d5 + 34f60e7 commit 77f7203

25 files changed

+621
-123
lines changed

audit/common/CodeNode.go

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@ import "time"
44

55
// CodeNode captures metadata about a specific code example. The `Code` field contains the example itself.
//
// Field order and bson tags define the stored document shape; do not reorder or rename without a data migration.
type CodeNode struct {
	// Code is the text of the code example.
	Code string `bson:"code"`

	// Language is the programming language recorded for the example.
	Language string `bson:"language"`

	// FileExtension is the file extension recorded for the example.
	FileExtension string `bson:"file_extension"`

	// Category is the category assigned to the example.
	Category string `bson:"category"`

	// SHA256Hash is the SHA-256 hash of the example's code.
	// NOTE(review): callers appear to hash the whitespace-trimmed code - confirm against the hashing helper.
	SHA256Hash string `bson:"sha_256_hash"`

	// LLMCategorized presumably records whether Category was assigned by the LLM - confirm against the categorizer.
	LLMCategorized bool `bson:"llm_categorized"`

	// DateAdded is when the example was first recorded.
	DateAdded time.Time `bson:"date_added"`

	// DateUpdated is when the example was last updated; omitted from the stored document when zero.
	DateUpdated time.Time `bson:"date_updated,omitempty"`

	// DateRemoved is when the example was marked as removed from its page; omitted when zero.
	DateRemoved time.Time `bson:"date_removed,omitempty"`

	// IsRemoved reports whether the example has been removed from its page; omitted when false.
	IsRemoved bool `bson:"is_removed,omitempty"`

	// InstancesOnPage is the number of times this exact example appears on the page. It is only set when the
	// example appears more than once; consumers treat a value of 0 as a single instance. Omitted when zero.
	InstancesOnPage int `bson:"instances_on_page,omitempty"`
}

audit/gdcd/CheckPagesForUpdates.go

Lines changed: 84 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@ package main
33
import (
44
"common"
55
"context"
6+
"fmt"
67
"gdcd/db"
8+
"gdcd/snooty"
79
"gdcd/types"
810
"gdcd/utils"
11+
"time"
912

1013
"github.com/tmc/langchaingo/llms/ollama"
1114
)
@@ -17,8 +20,13 @@ import (
1720
func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetails, llm *ollama.LLM, ctx context.Context, report types.ProjectReport) {
1821
incomingPageIdsMatchingExistingPages := make(map[string]bool)
1922
incomingDeletedPageCount := 0
20-
var newPageIds []string
21-
var newPages []common.DocsPage
23+
24+
// When a page doesn't match one in the DB, it could be either net new or a moved page. Hold it in a temp array
25+
// for comparison
26+
var maybeNewPages []types.NewOrMovedPage
27+
var newPages []types.NewOrMovedPage
28+
var newPageDBEntries []common.DocsPage
29+
var movedPages []types.NewOrMovedPage
2230
var updatedPages []common.DocsPage
2331
for _, page := range pages {
2432
// The Snooty Data API returns pages that may have been deleted. If the page is deleted, we want to check and see
@@ -27,6 +35,7 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail
2735
if page.Data.Deleted {
2836
report = HandleDeletedIncomingPages(project.ProjectName, page, report)
2937
incomingDeletedPageCount++
38+
utils.UpdateSecondaryTarget()
3039
} else {
3140
maybeExistingPage := CheckForExistingPage(project.ProjectName, page)
3241
if maybeExistingPage != nil {
@@ -39,20 +48,72 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail
3948
if updatedPage != nil {
4049
updatedPages = append(updatedPages, *updatedPage)
4150
}
51+
utils.UpdateSecondaryTarget()
4252
} else {
43-
// If there is no existing document in Atlas that matches the page, we need to make a new page
44-
var newPage common.DocsPage
45-
newPage, report = MakeNewPage(page, project.ProdUrl, report, llm, ctx)
46-
newPageIds = append(newPageIds, newPage.ID)
47-
newPages = append(newPages, newPage)
53+
// If no existing document in Atlas matches the page, it is either a net-new page or an existing page that has
54+
// moved to a new page ID. Store it in the temporary `maybeNewPages` slice so we can compare it against removed
55+
// pages later and, if it matches one, treat it as a "moved" page instead of creating a new one.
56+
newOrMovedPage := getNewOrMovedPageDetails(page.Data)
57+
maybeNewPages = append(maybeNewPages, newOrMovedPage)
4858
}
4959
}
50-
utils.UpdateSecondaryTarget()
5160
}
5261

5362
// After iterating through the incoming pages from the Snooty Data API, we need to figure out if any of the page IDs
54-
// we had in the DB are not coming in from the incoming response. If so, we should delete those entries.
55-
report = db.HandleMissingPageIds(project.ProjectName, incomingPageIdsMatchingExistingPages, report)
63+
// we had in the DB are not coming in from the incoming response. If so, those pages are either moved or removed.
64+
report, newPages, movedPages = db.HandleMissingPageIds(project.ProjectName, incomingPageIdsMatchingExistingPages, maybeNewPages, report)
65+
66+
// If we have new pages, create the corresponding DocsPage and increment the project report for them
67+
if newPages != nil {
68+
for _, page := range newPages {
69+
newPage := MakeNewPage(page.PageData, project.ProjectName, project.ProdUrl, llm, ctx)
70+
newPageDBEntries = append(newPageDBEntries, newPage)
71+
report = UpdateProjectReportForNewPage(newPage, report)
72+
utils.UpdateSecondaryTarget()
73+
}
74+
}
75+
76+
// If we have moved pages, handle them
77+
if movedPages != nil {
78+
for _, page := range movedPages {
79+
var movedPage common.DocsPage
80+
oldPage := db.GetAtlasPageData(project.ProjectName, page.OldPageId)
81+
82+
if oldPage != nil {
83+
movedPage = *oldPage
84+
movedPage.ID = page.NewPageId
85+
newPageUrl := utils.ConvertAtlasPageIdToProductionUrl(page.NewPageId, project.ProdUrl)
86+
movedPage.DateLastUpdated = time.Now()
87+
movedPage.PageURL = newPageUrl
88+
} else {
89+
movedPage = MakeNewPage(page.PageData, project.ProjectName, project.ProdUrl, llm, ctx)
90+
movedPage.DateAdded = page.DateAdded
91+
}
92+
93+
// Remove the old page from the DB
94+
db.RemovePageFromAtlas(project.ProjectName, page.OldPageId)
95+
96+
// Append the "moved" page to the `newPageDBEntries` array. Because the page ID doesn't match the old one,
97+
// we write it to the DB as a new page. Because we just deleted the old page, it works out to the same count
98+
// and provides the up-to-date data in the DB.
99+
newPageDBEntries = append(newPageDBEntries, movedPage)
100+
101+
incomingAstCodeNodes, incomingAstLiteralIncludeNodes, incomingAstIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(page.PageData.AST)
102+
incomingAstCodeNodeCount := len(incomingAstCodeNodes)
103+
incomingAstLiteralIncludeNodesCount := len(incomingAstLiteralIncludeNodes)
104+
incomingAstIoCodeBlockNodesCount := len(incomingAstIoCodeBlockNodes)
105+
// Update the project counts for the "existing" page
106+
report = IncrementProjectCountsForExistingPage(incomingAstCodeNodeCount, incomingAstLiteralIncludeNodesCount, incomingAstIoCodeBlockNodesCount, movedPage, report)
107+
108+
// Report it in the logs as a moved page
109+
stringMessageForReport := fmt.Sprintf("Old page ID: %s, new page ID: %s", page.OldPageId, page.NewPageId)
110+
report = utils.ReportChanges(types.PageMoved, report, stringMessageForReport)
111+
if movedPage.CodeNodesTotal != incomingAstCodeNodeCount {
112+
utils.ReportIssues(types.CodeNodeCountIssue, report, page.NewPageId, page.CodeNodeCount, len(incomingAstCodeNodes))
113+
}
114+
utils.UpdateSecondaryTarget()
115+
}
116+
}
56117

57118
// Get the existing "summaries" document from the DB, and update it.
58119
var summaryDoc common.CollectionReport
@@ -65,5 +126,17 @@ func CheckPagesForUpdates(pages []types.PageWrapper, project types.ProjectDetail
65126
LogReportForProject(project.ProjectName, report)
66127

67128
// At this point, we have all the new and updated pages and an updated summary. Write updates to Atlas.
68-
db.BatchUpdateCollection(project.ProjectName, newPages, updatedPages, summaryDoc)
129+
db.BatchUpdateCollection(project.ProjectName, newPageDBEntries, updatedPages, summaryDoc)
130+
}
131+
132+
func getNewOrMovedPageDetails(metadata types.PageMetadata) types.NewOrMovedPage {
133+
incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(metadata.AST)
134+
pageId := utils.ConvertSnootyPageIdToAtlasPageId(metadata.PageID)
135+
return types.NewOrMovedPage{
136+
PageId: pageId,
137+
CodeNodeCount: len(incomingCodeNodes),
138+
LiteralIncludeCount: len(incomingLiteralIncludeNodes),
139+
IoCodeBlockCount: len(incomingIoCodeBlockNodes),
140+
PageData: metadata,
141+
}
69142
}

audit/gdcd/MakeLanguagesArray.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,23 @@ func MakeLanguagesArray(codeNodes []common.CodeNode, literalIncludeNodes []types
1818
continue
1919
} else {
2020
if languageCounts, exists := languages[node.Language]; exists {
21-
languageCounts.Total += 1
22-
languages[node.Language] = languageCounts
21+
if node.InstancesOnPage > 0 {
22+
languageCounts.Total += node.InstancesOnPage
23+
languages[node.Language] = languageCounts
24+
} else {
25+
languageCounts.Total += 1
26+
languages[node.Language] = languageCounts
27+
}
2328
} else {
24-
countsForLang := languages[common.Undefined]
25-
countsForLang.LiteralIncludes += 1
26-
languages[common.Undefined] = countsForLang
29+
if node.InstancesOnPage > 0 {
30+
countsForLang := languages[common.Undefined]
31+
countsForLang.Total += node.InstancesOnPage
32+
languages[common.Undefined] = countsForLang
33+
} else {
34+
countsForLang := languages[common.Undefined]
35+
countsForLang.Total += 1
36+
languages[common.Undefined] = countsForLang
37+
}
2738
}
2839
}
2940
}

audit/gdcd/MakeNewPage.go

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package main
33
import (
44
"common"
55
"context"
6-
add_code_examples "gdcd/add-code-examples"
76
"gdcd/snooty"
87
"gdcd/types"
98
"gdcd/utils"
@@ -12,35 +11,51 @@ import (
1211
"github.com/tmc/langchaingo/llms/ollama"
1312
)
1413

15-
func MakeNewPage(data types.PageWrapper, siteUrl string, report types.ProjectReport, llm *ollama.LLM, ctx context.Context) (common.DocsPage, types.ProjectReport) {
16-
incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.Data.AST)
14+
func MakeNewPage(data types.PageMetadata, projectName string, siteUrl string, llm *ollama.LLM, ctx context.Context) common.DocsPage {
15+
incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.AST)
1716
incomingCodeNodeCount := len(incomingCodeNodes)
1817
incomingLiteralIncludeNodeCount := len(incomingLiteralIncludeNodes)
1918
incomingIoCodeNodeCount := len(incomingIoCodeBlockNodes)
20-
pageId := utils.ConvertSnootyPageIdToAtlasPageId(data.Data.PageID)
21-
pageUrl := utils.ConvertSnootyPageIdToProductionUrl(data.Data.PageID, siteUrl)
22-
product, subProduct := GetProductSubProduct(report.ProjectName, pageUrl)
19+
pageId := utils.ConvertSnootyPageIdToAtlasPageId(data.PageID)
20+
pageUrl := utils.ConvertSnootyPageIdToProductionUrl(data.PageID, siteUrl)
21+
product, subProduct := GetProductSubProduct(projectName, pageUrl)
2322
var isDriversProject bool
2423
if product == "Drivers" {
2524
isDriversProject = true
2625
} else {
2726
isDriversProject = false
2827
}
29-
newAppliedUsageExampleCount := 0
30-
var newCodeNodes []common.CodeNode
28+
29+
// Some of the new code examples coming in from the page may be duplicates. So we first make Sha256 hashes of the
30+
// incoming code examples, and count the number of times the hash appears on the page.
31+
snootySha256Hashes := make(map[string]int)
32+
snootySha256ToAstNodeMap := make(map[string]types.ASTNode)
33+
3134
for _, node := range incomingCodeNodes {
32-
newNode := snooty.MakeCodeNodeFromSnootyAST(node, llm, ctx, isDriversProject)
33-
newCodeNodes = append(newCodeNodes, newNode)
34-
if add_code_examples.IsNewAppliedUsageExample(newNode) {
35-
newAppliedUsageExampleCount++
35+
// This makes a hash from the whitespace-trimmed AST node. We trim whitespace on AST nodes before adding
36+
// them to the DB, so this ensures an incoming node hash can match a whitespace-trimmed existing node hash.
37+
hash := snooty.MakeSha256HashForCode(node.Value)
38+
39+
// Add the hash as an entry in the map, and increment its counter. If the hash does not already exist in the map,
40+
// this will create it. If it does already exist, this will just increment its counter.
41+
snootySha256Hashes[hash]++
42+
snootySha256ToAstNodeMap[hash] = node
43+
}
44+
45+
// Then, we go through the hashes, create the corresponding codeNodes, and set the `InstancesOnPage` if the example
46+
// appears more than once on the page.
47+
var newCodeNodes []common.CodeNode
48+
for hash, count := range snootySha256Hashes {
49+
newNode := snooty.MakeCodeNodeFromSnootyAST(snootySha256ToAstNodeMap[hash], llm, ctx, isDriversProject)
50+
if count > 1 {
51+
newNode.InstancesOnPage = count
3652
}
53+
newCodeNodes = append(newCodeNodes, newNode)
3754
}
38-
maybeKeywords := snooty.GetMetaKeywords(data.Data.AST.Children)
3955

40-
languagesArrayValues := MakeLanguagesArray(newCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes)
56+
maybeKeywords := snooty.GetMetaKeywords(data.AST.Children)
4157

42-
// Report relevant details for the new page
43-
report = UpdateProjectReportForNewPage(incomingCodeNodeCount, incomingLiteralIncludeNodeCount, incomingIoCodeNodeCount, len(newCodeNodes), newAppliedUsageExampleCount, pageId, report)
58+
languagesArrayValues := MakeLanguagesArray(newCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes)
4459

4560
return common.DocsPage{
4661
ID: pageId,
@@ -52,9 +67,9 @@ func MakeNewPage(data types.PageWrapper, siteUrl string, report types.ProjectRep
5267
LiteralIncludesTotal: incomingLiteralIncludeNodeCount,
5368
Nodes: &newCodeNodes,
5469
PageURL: pageUrl,
55-
ProjectName: report.ProjectName,
70+
ProjectName: projectName,
5671
Product: product,
5772
SubProduct: subProduct,
5873
Keywords: maybeKeywords,
59-
}, report
74+
}
6075
}

audit/gdcd/UpdateExistingPage.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ package main
33
import (
44
"common"
55
"context"
6-
add_code_examples "gdcd/add-code-examples"
7-
compare_code_examples "gdcd/compare-code-examples"
6+
"gdcd/add-code-examples"
7+
"gdcd/compare-code-examples"
88
"gdcd/db"
99
"gdcd/snooty"
1010
"gdcd/types"
@@ -17,14 +17,15 @@ import (
1717
func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, projectReport types.ProjectReport, llm *ollama.LLM, ctx context.Context) (*common.DocsPage, types.ProjectReport) {
1818
var existingCurrentCodeNodes []common.CodeNode
1919
var existingRemovedCodeNodes []common.CodeNode
20+
existingCodeNodeCount := 0
2021
// Some of the existing Nodes on the page could have been previously removed from the page. So we need to know which
2122
// nodes are "currently" on the page, and which nodes have already been removed. The ones that are "currently" on the
2223
// page should be used to compare code examples, but the ones that have already been removed from the page will be
2324
// appended to the Nodes array without changes after making all the other updates.
2425
if existingPage.Nodes != nil {
2526
existingCurrentCodeNodes, existingRemovedCodeNodes = db.GetCurrentRemovedAtlasCodeNodes(*existingPage.Nodes)
27+
existingCodeNodeCount = compare_code_examples.GetCodeNodeCount(*existingPage.Nodes)
2628
}
27-
existingCodeNodeCount := len(existingCurrentCodeNodes)
2829
incomingCodeNodes, incomingLiteralIncludeNodes, incomingIoCodeBlockNodes := snooty.GetCodeExamplesFromIncomingData(data.Data.AST)
2930
maybePageKeywords := snooty.GetMetaKeywords(data.Data.AST.Children)
3031
newAppliedUsageExampleCount := 0
@@ -75,7 +76,12 @@ func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, pr
7576
// Mark the node as removed and count how many on-page instances the removal accounts for. The instance
// count must be reset BEFORE the node is appended: assuming `updatedCodeNodes` holds CodeNode values,
// append stores a copy, so resetting the field afterward would leave the stale count in the stored node.
node.DateRemoved = time.Now()
node.IsRemoved = true
if node.InstancesOnPage > 0 {
	newRemovedNodeCount += node.InstancesOnPage
	node.InstancesOnPage = 0
} else {
	// A node without an explicit instance count represents a single instance.
	newRemovedNodeCount++
}
updatedCodeNodes = append(updatedCodeNodes, node)
7985
} else {
8086
updatedCodeNodes = append(updatedCodeNodes, node)
8187
}
@@ -123,7 +129,7 @@ func UpdateExistingPage(existingPage common.DocsPage, data types.PageWrapper, pr
123129
newAppliedUsageExampleCount++
124130
}
125131
}
126-
newCodeNodeCount := len(newCodeNodes)
132+
newCodeNodeCount := compare_code_examples.GetCodeNodeCount(newCodeNodes)
127133
updatedPage.Nodes = &newCodeNodes
128134

129135
// Update the AST code node count, io-block-count and literalinclude count
Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,43 @@
11
package main
22

33
import (
4+
"common"
5+
"gdcd/add-code-examples"
6+
compare_code_examples "gdcd/compare-code-examples"
47
"gdcd/types"
58
"gdcd/utils"
69
)
710

8-
func UpdateProjectReportForNewPage(incomingCodeNodeCount int, incomingLiteralIncludeNodeCount int, incomingIoCodeBlockNodeCount int, newCodeNodes int, newAppliedUsageExampleCount int, pageId string, report types.ProjectReport) types.ProjectReport {
9-
report.Counter.IncomingCodeNodesCount += incomingCodeNodeCount
10-
report.Counter.IncomingLiteralIncludeCount += incomingLiteralIncludeNodeCount
11-
report.Counter.IncomingIoCodeBlockCount += incomingIoCodeBlockNodeCount
12-
report.Counter.NewCodeNodesCount += newCodeNodes
13-
report.Counter.NewAppliedUsageExamplesCount += newAppliedUsageExampleCount
11+
func UpdateProjectReportForNewPage(page common.DocsPage, report types.ProjectReport) types.ProjectReport {
12+
report.Counter.IncomingCodeNodesCount += page.CodeNodesTotal
13+
report.Counter.IncomingLiteralIncludeCount += page.LiteralIncludesTotal
14+
report.Counter.IncomingIoCodeBlockCount += page.IoCodeBlocksTotal
15+
report.Counter.NewCodeNodesCount += page.CodeNodesTotal
1416
report.Counter.NewPagesCount += 1
15-
report = utils.ReportChanges(types.PageCreated, report, pageId)
16-
if newCodeNodes > 0 {
17-
report = utils.ReportChanges(types.CodeExampleCreated, report, pageId, newCodeNodes)
17+
report = utils.ReportChanges(types.PageCreated, report, page.ID)
18+
if page.CodeNodesTotal > 0 {
19+
report = utils.ReportChanges(types.CodeExampleCreated, report, page.ID, page.CodeNodesTotal)
20+
}
21+
22+
// Figure out how many of the page's code examples are new applied usage examples
23+
newAppliedUsageExampleCount := 0
24+
newCodeNodeCount := 0
25+
if page.Nodes != nil {
26+
for _, node := range *page.Nodes {
27+
if add_code_examples.IsNewAppliedUsageExample(node) {
28+
newAppliedUsageExampleCount++
29+
}
30+
}
31+
newCodeNodeCount = compare_code_examples.GetCodeNodeCount(*page.Nodes)
1832
}
33+
report.Counter.NewAppliedUsageExamplesCount += newAppliedUsageExampleCount
1934

2035
if newAppliedUsageExampleCount > 0 {
21-
report = utils.ReportChanges(types.AppliedUsageExampleAdded, report, pageId, newAppliedUsageExampleCount)
36+
report = utils.ReportChanges(types.AppliedUsageExampleAdded, report, page.ID, newAppliedUsageExampleCount)
2237
}
2338

24-
newCodeNodeCount := newCodeNodes
25-
if incomingCodeNodeCount != newCodeNodeCount {
26-
report = utils.ReportIssues(types.CodeNodeCountIssue, report, pageId, incomingCodeNodeCount, newCodeNodeCount)
39+
if page.CodeNodesTotal != newCodeNodeCount {
40+
report = utils.ReportIssues(types.CodeNodeCountIssue, report, page.ID, page.CodeNodesTotal, newCodeNodeCount)
2741
}
2842
return report
2943
}

0 commit comments

Comments
 (0)