Better doc search (#8260)

* Modified the doc generation to produce a custom data attribute to allow for better search functionality

* Implemented fuzzy matching for the Nim Doc search instead of the simple regex match.

* Fix to the WordBoundry state transition from code review with @Varriount. Also removed silly testing template that is no longer used.

* Update fuzzysearch.nim

* Update fuzzysearch.nim

* Update fuzzysearch.nim

* Update dochack.nim

* Update dochack.nim
This commit is contained in:
Ray Imber
2018-07-20 02:58:42 -07:00
committed by Varriount
parent f92d61b1f4
commit 060871e64a
3 changed files with 167 additions and 20 deletions

View File

@@ -449,10 +449,11 @@ proc generateSymbolIndex(symbols: seq[IndexEntry]): string =
desc = if not symbols[j].linkDesc.isNil: symbols[j].linkDesc else: ""
if desc.len > 0:
result.addf("""<li><a class="reference external"
title="$3" href="$1">$2</a></li>
title="$3" data-doc-search-tag="$2" href="$1">$2</a></li>
""", [url, text, desc])
else:
result.addf("""<li><a class="reference external" href="$1">$2</a></li>
result.addf("""<li><a class="reference external"
data-doc-search-tag="$2" href="$1">$2</a></li>
""", [url, text])
inc j
result.add("</ul></dd>\n")
@@ -493,6 +494,7 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
# Build a list of levels and extracted titles to make processing easier.
var
titleRef: string
titleTag: string
levels: seq[tuple[level: int, text: string]]
L = 0
level = 1
@@ -519,10 +521,12 @@ proc generateDocumentationTOC(entries: seq[IndexEntry]): string =
let link = entries[L].link
if link.isDocumentationTitle:
titleRef = link
titleTag = levels[L].text
else:
result.add(level.indentToLevel(levels[L].level))
result.add("<li><a href=\"" & link & "\">" &
levels[L].text & "</a></li>\n")
result.addf("""<li><a class="reference" data-doc-search-tag="$1" href="$2">
$3</a></li>
""", [titleTag & " : " & levels[L].text, link, levels[L].text])
inc L
result.add(level.indentToLevel(1) & "</ul>\n")
assert(not titleRef.isNil,

View File

@@ -1,6 +1,5 @@
import karax
import fuzzysearch
proc findNodeWith(x: Element; tag, content: cstring): Element =
if x.nodeName == tag and x.textContent == content:
@@ -88,11 +87,11 @@ proc toHtml(x: TocEntry; isRoot=false): Element =
if ul.len != 0: result.add ul
if result.len == 0: result = nil
proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
{.emit: """
var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
return new RegExp("\\b" + escaped + "\\b").test(`a`);
""".}
#proc containsWord(a, b: cstring): bool {.asmNoStackFrame.} =
#{.emit: """
#var escaped = `b`.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, "\\$&");
#return new RegExp("\\b" + escaped + "\\b").test(`a`);
#""".}
proc isWhitespace(text: cstring): bool {.asmNoStackFrame.} =
{.emit: """
@@ -252,24 +251,29 @@ proc dosearch(value: cstring): Element =
`stuff` = doc.documentElement;
""".}
db = stuff.getElementsByClass"reference external"
db = stuff.getElementsByClass"reference"
contents = @[]
for ahref in db:
contents.add ahref.textContent.normalize
contents.add ahref.getAttribute("data-doc-search-tag")
let ul = tree("UL")
result = tree("DIV")
result.setClass"search_results"
var matches: seq[(Element, int)] = @[]
let key = value.normalize
for i in 0..<db.len:
let c = contents[i]
if c.containsWord(key):
matches.add((db[i], -(30_000 - c.len)))
elif c.contains(key):
matches.add((db[i], c.len))
if c == "Examples" or c == "PEG construction":
# Some manual exclusions.
# Ideally these should be fixed in the index to be more
# descriptive of what they are.
continue
let (score, matched) = fuzzymatch(value, c)
if matched:
matches.add((db[i], score))
matches.sort do (a, b: auto) -> int:
a[1] - b[1]
for i in 0..min(<matches.len, 19):
b[1] - a[1]
for i in 0 ..< min(matches.len, 19):
matches[i][0].innerHTML = matches[i][0].getAttribute("data-doc-search-tag")
ul.add(tree("LI", matches[i][0]))
if ul.len == 0:
result.add tree("B", text"no search results")

View File

@@ -0,0 +1,139 @@
# A Fuzzy Match implementation inspired by the sublime text fuzzy match algorithm
# as described here: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
# Heavily modified to provide more subjectively useful results
# for the Nim manual.
#
import strutils
import math
import macros
const
  MaxUnmatchedLeadingChar = 3
    ## Maximum number of times the penalty for unmatched leading chars is
    ## applied.

  HeadingScaleFactor = 0.5
    ## The score from before the colon char is multiplied by this.
    ## This is to weight function signatures and descriptions over module
    ## titles.

type
  ScoreCard = enum
    StartMatch = -100      ## Start matching.
    LeadingCharDiff = -3   ## An unmatched, leading character was found.
    CharDiff = -1          ## An unmatched character was found.
    CharMatch = 0          ## A matched character was found.
    ConsecutiveMatch = 5   ## A consecutive match was found.
    LeadingCharMatch = 10  ## The character matches the beginning of the
                           ## string or the first character of a word
                           ## or camel case boundary.
    WordBoundryMatch = 20  ## The last ConsecutiveCharMatch that
                           ## immediately precedes the end of the string,
                           ## end of the pattern, or a LeadingCharMatch.

proc fuzzyMatch*(pattern, str: cstring): tuple[score: int, matched: bool] =
  ## Scores how well `pattern` fuzzily matches `str`.
  ##
  ## Both inputs are compared case-insensitively (ASCII only) and the
  ## characters `'_'`, `' '` and `'.'` are skipped in either of them.
  ## The first `':'` met in `str` is treated as the end of a heading: the
  ## score gathered so far is multiplied by `HeadingScaleFactor` and the
  ## pattern is matched again from its start against the rest of `str`.
  ##
  ## Returns the score clamped to a minimum of `0`, and `matched = true`
  ## when the unclamped score is strictly positive.
  var
    scoreState = StartMatch
    headerMatched = false
    unmatchedLeadingCharCount = 0
    consecutiveMatchCount = 0
    strIndex = 0
    patIndex = 0
    score = 0

  template transition(nextState) =
    # Enter `nextState` and apply its bonus/penalty to the running score.
    scoreState = nextState
    score += ord(scoreState)

  while (strIndex < str.len) and (patIndex < pattern.len):
    let
      patternChar = pattern[patIndex].toLowerAscii
      strChar = str[strIndex].toLowerAscii

    # Ignore certain characters
    if patternChar in {'_', ' ', '.'}:
      patIndex += 1
      continue
    if strChar in {'_', ' ', '.'}:
      strIndex += 1
      continue

    # Since this algorithm will be used to search against Nim documentation,
    # the below logic prioritizes headers.
    if not headerMatched and strChar == ':':
      headerMatched = true
      scoreState = StartMatch
      score = toInt(floor(HeadingScaleFactor * toFloat(score)))
      patIndex = 0
      strIndex += 1
      continue

    if strChar == patternChar:
      case scoreState
      of StartMatch, WordBoundryMatch:
        # No bonus yet: it is granted once a consecutive match confirms it.
        scoreState = LeadingCharMatch

      of CharMatch:
        transition(ConsecutiveMatch)

      of LeadingCharMatch, ConsecutiveMatch:
        # Remember the previous state *before* overwriting it, otherwise
        # the deferred leading-character bonus could never be applied.
        let wasLeadingChar = (scoreState == LeadingCharMatch)

        consecutiveMatchCount += 1
        scoreState = ConsecutiveMatch
        score += ord(ConsecutiveMatch) * consecutiveMatchCount

        if wasLeadingChar:
          score += ord(LeadingCharMatch)

        # The end of the pattern and the end of the string are both
        # boundaries; checking them first also keeps the look-ahead
        # below inside the bounds of `pattern` and `str`.
        var onBoundary =
          (patIndex == high(pattern)) or (strIndex == high(str))
        if not onBoundary:
          let
            nextPatternChar = toLowerAscii(pattern[patIndex + 1])
            nextStrChar = toLowerAscii(str[strIndex + 1])

          onBoundary = (
            nextStrChar notin {'a'..'z'} and
            nextStrChar != nextPatternChar
          )

        if onBoundary:
          transition(WordBoundryMatch)

      of CharDiff, LeadingCharDiff:
        # A non-alphabetic predecessor or a camel case transition counts
        # as a leading char. These states are only entered after at least
        # one character of `str` was consumed, so `strIndex - 1` is valid.
        let isLeadingChar = (
          str[strIndex - 1] notin Letters or
          (str[strIndex - 1] in {'a'..'z'} and
           str[strIndex] in {'A'..'Z'})
        )

        if isLeadingChar:
          # Transition the state, but don't give the bonus yet; wait until
          # we verify a consecutive match.
          scoreState = LeadingCharMatch
        else:
          transition(CharMatch)

      patIndex += 1

    else:
      case scoreState
      of StartMatch:
        transition(LeadingCharDiff)

      of ConsecutiveMatch:
        transition(CharDiff)
        consecutiveMatchCount = 0

      of LeadingCharDiff:
        # The leading-char penalty is only applied a bounded number of times.
        if unmatchedLeadingCharCount < MaxUnmatchedLeadingChar:
          transition(LeadingCharDiff)
        unmatchedLeadingCharCount += 1

      else:
        transition(CharDiff)

    strIndex += 1

  result = (
    score: max(0, score),
    matched: (score > 0)
  )