Better doc search (#8260)

* Modified the doc generation to produce a custom data attribute to allow for better search functionality * Implemented fuzzy matching for the Nim Doc search instead of the simple regex match. * Fix to the WordBoundry state transition from code review with @Varriount. Also removed silly testing template that is no longer used. * Update fuzzysearch.nim * Update fuzzysearch.nim * Update fuzzysearch.nim * Update dochack.nim * Update dochack.nim
2026-07-28 11:26:39 +00:00 · 2018-07-20 02:58:42 -07:00
parent f92d61b1f4
commit 060871e64a
3 changed files with 167 additions and 20 deletions
--- a/lib/packages/docutils/rstgen.nim
+++ b/lib/packages/docutils/rstgen.nim
@@ -449,10 +449,11 @@ proc generateSymbolIndex(symbols: seq[IndexEntry]): string =
        desc = if not symbols[j].linkDesc.isNil: symbols[j].linkDesc else: ""
      if desc.len > 0:
        result.addf("""<li><a class="reference external"
-          title="$3" href="$1">$2</a></li>
+          title="$3" data-doc-search-tag="$2" href="$1">$2</a></li>
          """, [url, text, desc])
      else:
-        result.addf("""<li><a class="reference external" href="$1">$2</a></li>
+        result.addf("""<li><a class="reference external" 
+          data-doc-search-tag="$2" href="$1">$2</a></li>
          """, [url, text])
      inc j
    result.add("</ul></dd>\n")
@@ -493,6 +494,7 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
  # Build a list of levels and extracted titles to make processing easier.
  var
    titleRef: string
+    titleTag: string
    levels: seq[tuple[level: int, text: string]]
    L = 0
    level = 1
@@ -519,10 +521,12 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
    let link = entries[L].link
    if link.isDocumentationTitle:
      titleRef = link
+      titleTag = levels[L].text
    else:
      result.add(level.indentToLevel(levels[L].level))
-      result.add("<li><a href=\"" & link & "\">" &
-        levels[L].text & "</a></li>\n")
+      result.addf("""<li><a class="reference" data-doc-search-tag="$1" href="$2"> 
+        $3</a></li>
+        """, [titleTag & " : " & levels[L].text, link, levels[L].text])
    inc L
  result.add(level.indentToLevel(1) & "</ul>\n")
  assert(not titleRef.isNil,
--- a/tools/dochack/dochack.nim
+++ b/tools/dochack/dochack.nim
@@ -1,6 +1,5 @@
-
-
 import karax
+import fuzzysearch

 proc findNodeWith(x: Element; tag, content: cstring): Element =
  if x.nodeName == tag and x.textContent == content:
@@ -88,11 +87,11 @@ proc toHtml(x: TocEntry; isRoot=false): Element =
  if ul.len != 0: result.add ul
  if result.len == 0: result = nil

-proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
-  {.emit: """
-     var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
-     return new RegExp("\\b" + escaped + "\\b").test(`a`);
-  """.}
+#proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
+  #{.emit: """
+     #var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
+     #return new RegExp("\\b" + escaped + "\\b").test(`a`);
+  #""".}

 proc isWhitespace(text: cstring): bool {.asmNoStackFrame.} =
  {.emit: """
@@ -252,24 +251,29 @@ proc dosearch(value: cstring): Element =

    `stuff` = doc.documentElement;
    """.}
-    db = stuff.getElementsByClass"reference external"
+    db = stuff.getElementsByClass"reference"
    contents = @[]
    for ahref in db:
-      contents.add ahref.textContent.normalize
+      contents.add ahref.getAttribute("data-doc-search-tag")
  let ul = tree("UL")
  result = tree("DIV")
  result.setClass"search_results"
  var matches: seq[(Element, int)] = @[]
-  let key = value.normalize
  for i in 0..<db.len:
    let c = contents[i]
-    if c.containsWord(key):
-      matches.add((db[i], -(30_000 - c.len)))
-    elif c.contains(key):
-      matches.add((db[i], c.len))
+    if c == "Examples" or c == "PEG construction":
+    # Some manual exclusions.
+    # Ideally these should be fixed in the index to be more
+    # descriptive of what they are.
+      continue
+    let (score, matched) = fuzzymatch(value, c)
+    if matched:
+      matches.add((db[i], score))
+
  matches.sort do (a, b: auto) -> int:
-    a[1] - b[1]
-  for i in 0..min(<matches.len, 19):
+    b[1] - a[1]
+  for i in 0 ..< min(matches.len, 19):
+    matches[i][0].innerHTML = matches[i][0].getAttribute("data-doc-search-tag")
    ul.add(tree("LI", matches[i][0]))
  if ul.len == 0:
    result.add tree("B", text"no search results")
--- a/tools/dochack/fuzzysearch.nim
+++ b/tools/dochack/fuzzysearch.nim
@@ -0,0 +1,139 @@
+# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm
+# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
+# Heavily modified to provide more subjectively useful results
+# for on the Nim manual.
+#
+import strutils
+import math
+import macros
+
+
+const
+  MaxUnmatchedLeadingChar = 3
+  ## Maximum number of times the penalty for unmatched leading chars is applied.
+
+  HeadingScaleFactor = 0.5
+  ## The score from before the colon Char is multiplied by this.
+  ## This is to weight function signatures and descriptions over module titles.
+
+
+type 
+  ScoreCard = enum 
+    StartMatch           = -100 ## Start matching.
+    LeadingCharDiff      = -3   ## An unmatched, leading character was found.
+    CharDiff             = -1   ## An unmatched character was found.
+    CharMatch            = 0    ## A matched character was found.
+    ConsecutiveMatch     = 5    ## A consecutive match was found.
+    LeadingCharMatch     = 10   ## The character matches the begining of the
+                                ## string or the first character of a word
+                                ## or camel case boundry.
+    WordBoundryMatch     = 20   ## The last ConsecutiveCharMatch that
+                                ## immediately precedes the end of the string,
+                                ## end of the pattern, or a LeadingCharMatch.
+
+
+proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] =
+  var
+    scoreState = StartMatch
+    headerMatched = false
+    unmatchedLeadingCharCount = 0
+    consecutiveMatchCount = 0
+    strIndex = 0
+    patIndex = 0
+    score = 0
+
+  template transition(nextState) =
+    scoreState = nextState
+    score += ord(scoreState)
+
+  while (strIndex < str.len) and (patIndex < pattern.len):
+    var
+      patternChar = pattern[patIndex].toLowerAscii
+      strChar     = str[strIndex].toLowerAscii
+
+    # Ignore certain characters
+    if patternChar in {'_', ' ', '.'}:
+      patIndex += 1
+      continue
+    if strChar in {'_', ' ', '.'}:
+      strIndex += 1
+      continue
+    
+    # Since this algorithm will be used to search against Nim documentation,
+    # the below logic prioritizes headers.
+    if not headerMatched and strChar == ':':
+      headerMatched = true
+      scoreState = StartMatch
+      score = toInt(floor(HeadingScaleFactor * toFloat(score)))
+      patIndex = 0
+      strIndex += 1
+      continue
+
+    if strChar == patternChar:
+      case scoreState 
+      of StartMatch, WordBoundryMatch:
+        scoreState = LeadingCharMatch
+
+      of CharMatch:
+        transition(ConsecutiveMatch)
+
+      of LeadingCharMatch, ConsecutiveMatch:
+        consecutiveMatchCount += 1
+        scoreState = ConsecutiveMatch
+        score += ord(ConsecutiveMatch) * consecutiveMatchCount
+
+        if scoreState == LeadingCharMatch:
+          score += ord(LeadingCharMatch)
+          
+        var onBoundary = (patIndex == high(pattern))
+        if not onBoundary:
+          let
+            nextPatternChar = toLowerAscii(pattern[patIndex + 1])
+            nextStrChar     = toLowerAscii(str[strIndex + 1])
+
+          onBoundary = (
+            nextStrChar notin {'a'..'z'} and
+            nextStrChar != nextPatternChar
+          )
+        
+        if onBoundary:
+          transition(WordBoundryMatch)
+
+      of CharDiff, LeadingCharDiff:
+        var isLeadingChar = (
+          str[strIndex - 1] notin Letters or
+          str[strIndex - 1] in {'a'..'z'} and
+          str[strIndex] in {'A'..'Z'}
+        )
+
+        if isLeadingChar:
+          scoreState = LeadingCharMatch
+          #a non alpha or a camel case transition counts as a leading char.
+          # Transition the state, but don't give the bonus yet; wait until we verify a consecutive match.
+        else:
+          transition(CharMatch)
+      patIndex += 1
+
+    else:
+      case scoreState 
+      of StartMatch:
+        transition(LeadingCharDiff)
+
+      of ConsecutiveMatch:
+        transition(CharDiff)
+        consecutiveMatchCount = 0
+
+      of LeadingCharDiff:
+        if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar:
+          transition(LeadingCharDiff)
+        unmatchedLeadingCharCount += 1
+
+      else:
+        transition(CharDiff)
+
+    strIndex += 1
+
+  result = (
+    score:   max(0, score),
+    matched: (score > 0),
+  )