mirror of
https://github.com/nim-lang/Nim.git
synced 2026-06-07 12:24:19 +00:00
Better doc search (#8260)
* Modified the doc generation to produce a custom data attribute to allow for better search functionality * Implemented fuzzy matching for the Nim Doc search instead of the simple regex match. * Fix to the WordBoundry state transition from code review with @Varriount. Also removed silly testing template that is no longer used. * Update fuzzysearch.nim * Update fuzzysearch.nim * Update fuzzysearch.nim * Update dochack.nim * Update dochack.nim
This commit is contained in:
@@ -449,10 +449,11 @@ proc generateSymbolIndex(symbols: seq[IndexEntry]): string =
|
||||
desc = if not symbols[j].linkDesc.isNil: symbols[j].linkDesc else: ""
|
||||
if desc.len > 0:
|
||||
result.addf("""<li><a class="reference external"
|
||||
title="$3" href="$1">$2</a></li>
|
||||
title="$3" data-doc-search-tag="$2" href="$1">$2</a></li>
|
||||
""", [url, text, desc])
|
||||
else:
|
||||
result.addf("""<li><a class="reference external" href="$1">$2</a></li>
|
||||
result.addf("""<li><a class="reference external"
|
||||
data-doc-search-tag="$2" href="$1">$2</a></li>
|
||||
""", [url, text])
|
||||
inc j
|
||||
result.add("</ul></dd>\n")
|
||||
@@ -493,6 +494,7 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
|
||||
# Build a list of levels and extracted titles to make processing easier.
|
||||
var
|
||||
titleRef: string
|
||||
titleTag: string
|
||||
levels: seq[tuple[level: int, text: string]]
|
||||
L = 0
|
||||
level = 1
|
||||
@@ -519,10 +521,12 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
|
||||
let link = entries[L].link
|
||||
if link.isDocumentationTitle:
|
||||
titleRef = link
|
||||
titleTag = levels[L].text
|
||||
else:
|
||||
result.add(level.indentToLevel(levels[L].level))
|
||||
result.add("<li><a href=\"" & link & "\">" &
|
||||
levels[L].text & "</a></li>\n")
|
||||
result.addf("""<li><a class="reference" data-doc-search-tag="$1" href="$2">
|
||||
$3</a></li>
|
||||
""", [titleTag & " : " & levels[L].text, link, levels[L].text])
|
||||
inc L
|
||||
result.add(level.indentToLevel(1) & "</ul>\n")
|
||||
assert(not titleRef.isNil,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
|
||||
|
||||
import karax
|
||||
import fuzzysearch
|
||||
|
||||
proc findNodeWith(x: Element; tag, content: cstring): Element =
|
||||
if x.nodeName == tag and x.textContent == content:
|
||||
@@ -88,11 +87,11 @@ proc toHtml(x: TocEntry; isRoot=false): Element =
|
||||
if ul.len != 0: result.add ul
|
||||
if result.len == 0: result = nil
|
||||
|
||||
proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
|
||||
{.emit: """
|
||||
var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
|
||||
return new RegExp("\\b" + escaped + "\\b").test(`a`);
|
||||
""".}
|
||||
#proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
|
||||
#{.emit: """
|
||||
#var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
|
||||
#return new RegExp("\\b" + escaped + "\\b").test(`a`);
|
||||
#""".}
|
||||
|
||||
proc isWhitespace(text: cstring): bool {.asmNoStackFrame.} =
|
||||
{.emit: """
|
||||
@@ -252,24 +251,29 @@ proc dosearch(value: cstring): Element =
|
||||
|
||||
`stuff` = doc.documentElement;
|
||||
""".}
|
||||
db = stuff.getElementsByClass"reference external"
|
||||
db = stuff.getElementsByClass"reference"
|
||||
contents = @[]
|
||||
for ahref in db:
|
||||
contents.add ahref.textContent.normalize
|
||||
contents.add ahref.getAttribute("data-doc-search-tag")
|
||||
let ul = tree("UL")
|
||||
result = tree("DIV")
|
||||
result.setClass"search_results"
|
||||
var matches: seq[(Element, int)] = @[]
|
||||
let key = value.normalize
|
||||
for i in 0..<db.len:
|
||||
let c = contents[i]
|
||||
if c.containsWord(key):
|
||||
matches.add((db[i], -(30_000 - c.len)))
|
||||
elif c.contains(key):
|
||||
matches.add((db[i], c.len))
|
||||
if c == "Examples" or c == "PEG construction":
|
||||
# Some manual exclusions.
|
||||
# Ideally these should be fixed in the index to be more
|
||||
# descriptive of what they are.
|
||||
continue
|
||||
let (score, matched) = fuzzymatch(value, c)
|
||||
if matched:
|
||||
matches.add((db[i], score))
|
||||
|
||||
matches.sort do (a, b: auto) -> int:
|
||||
a[1] - b[1]
|
||||
for i in 0..min(<matches.len, 19):
|
||||
b[1] - a[1]
|
||||
for i in 0 ..< min(matches.len, 19):
|
||||
matches[i][0].innerHTML = matches[i][0].getAttribute("data-doc-search-tag")
|
||||
ul.add(tree("LI", matches[i][0]))
|
||||
if ul.len == 0:
|
||||
result.add tree("B", text"no search results")
|
||||
|
||||
139
tools/dochack/fuzzysearch.nim
Normal file
139
tools/dochack/fuzzysearch.nim
Normal file
@@ -0,0 +1,139 @@
|
||||
# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm
|
||||
# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
|
||||
# Heavily modified to provide more subjectively useful results
|
||||
# for on the Nim manual.
|
||||
#
|
||||
import strutils
|
||||
import math
|
||||
import macros
|
||||
|
||||
|
||||
const
|
||||
MaxUnmatchedLeadingChar = 3
|
||||
## Maximum number of times the penalty for unmatched leading chars is applied.
|
||||
|
||||
HeadingScaleFactor = 0.5
|
||||
## The score from before the colon Char is multiplied by this.
|
||||
## This is to weight function signatures and descriptions over module titles.
|
||||
|
||||
|
||||
type
|
||||
ScoreCard = enum
|
||||
StartMatch = -100 ## Start matching.
|
||||
LeadingCharDiff = -3 ## An unmatched, leading character was found.
|
||||
CharDiff = -1 ## An unmatched character was found.
|
||||
CharMatch = 0 ## A matched character was found.
|
||||
ConsecutiveMatch = 5 ## A consecutive match was found.
|
||||
LeadingCharMatch = 10 ## The character matches the begining of the
|
||||
## string or the first character of a word
|
||||
## or camel case boundry.
|
||||
WordBoundryMatch = 20 ## The last ConsecutiveCharMatch that
|
||||
## immediately precedes the end of the string,
|
||||
## end of the pattern, or a LeadingCharMatch.
|
||||
|
||||
|
||||
proc fuzzyMatch*(pattern, str: cstring) : tuple[score: int, matched: bool] =
|
||||
var
|
||||
scoreState = StartMatch
|
||||
headerMatched = false
|
||||
unmatchedLeadingCharCount = 0
|
||||
consecutiveMatchCount = 0
|
||||
strIndex = 0
|
||||
patIndex = 0
|
||||
score = 0
|
||||
|
||||
template transition(nextState) =
|
||||
scoreState = nextState
|
||||
score += ord(scoreState)
|
||||
|
||||
while (strIndex < str.len) and (patIndex < pattern.len):
|
||||
var
|
||||
patternChar = pattern[patIndex].toLowerAscii
|
||||
strChar = str[strIndex].toLowerAscii
|
||||
|
||||
# Ignore certain characters
|
||||
if patternChar in {'_', ' ', '.'}:
|
||||
patIndex += 1
|
||||
continue
|
||||
if strChar in {'_', ' ', '.'}:
|
||||
strIndex += 1
|
||||
continue
|
||||
|
||||
# Since this algorithm will be used to search against Nim documentation,
|
||||
# the below logic prioritizes headers.
|
||||
if not headerMatched and strChar == ':':
|
||||
headerMatched = true
|
||||
scoreState = StartMatch
|
||||
score = toInt(floor(HeadingScaleFactor * toFloat(score)))
|
||||
patIndex = 0
|
||||
strIndex += 1
|
||||
continue
|
||||
|
||||
if strChar == patternChar:
|
||||
case scoreState
|
||||
of StartMatch, WordBoundryMatch:
|
||||
scoreState = LeadingCharMatch
|
||||
|
||||
of CharMatch:
|
||||
transition(ConsecutiveMatch)
|
||||
|
||||
of LeadingCharMatch, ConsecutiveMatch:
|
||||
consecutiveMatchCount += 1
|
||||
scoreState = ConsecutiveMatch
|
||||
score += ord(ConsecutiveMatch) * consecutiveMatchCount
|
||||
|
||||
if scoreState == LeadingCharMatch:
|
||||
score += ord(LeadingCharMatch)
|
||||
|
||||
var onBoundary = (patIndex == high(pattern))
|
||||
if not onBoundary:
|
||||
let
|
||||
nextPatternChar = toLowerAscii(pattern[patIndex + 1])
|
||||
nextStrChar = toLowerAscii(str[strIndex + 1])
|
||||
|
||||
onBoundary = (
|
||||
nextStrChar notin {'a'..'z'} and
|
||||
nextStrChar != nextPatternChar
|
||||
)
|
||||
|
||||
if onBoundary:
|
||||
transition(WordBoundryMatch)
|
||||
|
||||
of CharDiff, LeadingCharDiff:
|
||||
var isLeadingChar = (
|
||||
str[strIndex - 1] notin Letters or
|
||||
str[strIndex - 1] in {'a'..'z'} and
|
||||
str[strIndex] in {'A'..'Z'}
|
||||
)
|
||||
|
||||
if isLeadingChar:
|
||||
scoreState = LeadingCharMatch
|
||||
#a non alpha or a camel case transition counts as a leading char.
|
||||
# Transition the state, but don't give the bonus yet; wait until we verify a consecutive match.
|
||||
else:
|
||||
transition(CharMatch)
|
||||
patIndex += 1
|
||||
|
||||
else:
|
||||
case scoreState
|
||||
of StartMatch:
|
||||
transition(LeadingCharDiff)
|
||||
|
||||
of ConsecutiveMatch:
|
||||
transition(CharDiff)
|
||||
consecutiveMatchCount = 0
|
||||
|
||||
of LeadingCharDiff:
|
||||
if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar:
|
||||
transition(LeadingCharDiff)
|
||||
unmatchedLeadingCharCount += 1
|
||||
|
||||
else:
|
||||
transition(CharDiff)
|
||||
|
||||
strIndex += 1
|
||||
|
||||
result = (
|
||||
score: max(0, score),
|
||||
matched: (score > 0),
|
||||
)
|
||||
Reference in New Issue
Block a user