-- neovim/runtime/lua/vim/lsp/sync.lua

-- Notes on incremental sync:
--  Per the protocol, the text range should be:
--
--  A position inside a document (see Position definition below) is expressed as
--  a zero-based line and character offset. The offsets are based on a UTF-16
--  string representation. So a string of the form a𐐀b the character offset
--  of the character a is 0, the character offset of 𐐀 is 1 and the character
--  offset of b is 3 since 𐐀 is represented using two code units in UTF-16.
--
--  To ensure that both client and server split the string into the same line
--  representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’.
--
--  Positions are line end character agnostic. So you can not specify a position that
--  denotes \r|\n or \n| where | represents the character offset. This means a range
--  must *not* be defined so that it ends on the same line after a terminating character.
--
-- Generic warnings about byte level changes in neovim. Many apparently "single"
-- operations in on_lines callbacks are actually multiple operations.
--
--  Join operation (2 operations):
--  * extends line 1 with the contents of line 2
--  * deletes line 2
--
--  test 1    test 1 test 2    test 1 test 2
--  test 2 -> test 2        -> test 3
--  test 3    test 3
--
--  Deleting (and undoing) two middle lines (1 operation):
--
--  test 1    test 1
--  test 2 -> test 4
--  test 3
--  test 4
--
--  Deleting partial lines (5 operations); the text between the asterisks below is deleted:
--
--  test *1   test *    test *     test *    test *4    test *4*
--  test 2 -> test 2 -> test *4 -> *4     -> *4      ->
--  test 3    test 3
--  test *4   test 4

-- Module table; public functions are attached to it below and it is returned
-- at the end of the file.
local M = {}

-- Library functions cached in locals for use by the helpers in this file.
-- It is unclear whether localizing string.byte is necessary for JIT compilation.
local str_byte = string.byte
local min = math.min
local str_utfindex = vim.str_utfindex
local str_utf_start = vim.str_utf_start
local str_utf_end = vim.str_utf_end

---@private
--- Translates a 1-based byte index within a line into the matching 1-based
--- index for the requested offset encoding.
---@param line string the line to index into
---@param byte integer 1-based byte index into `line`
---@param offset_encoding string utf-8|utf-16|utf-32|nil (anything else is treated as utf-8)
---@returns integer the 1-based index for the given encoding
local function byte_to_utf(line, byte, offset_encoding)
  -- str_utfindex is queried with a 0-based byte offset
  local zero_based = byte - 1

  if offset_encoding == 'utf-16' then
    -- utf-16 uses the second value returned by str_utfindex
    local _, utf16_idx = str_utfindex(line, zero_based)
    return utf16_idx + 1
  end

  if offset_encoding == 'utf-32' then
    -- utf-32 uses the first value returned by str_utfindex
    local utf32_idx = str_utfindex(line, zero_based)
    return utf32_idx + 1
  end

  -- utf-8 (or unspecified): the byte offset is already the index;
  -- shift back to 1-based indexing
  return zero_based + 1
end

---@private
--- Measures a whole line in the units of the requested offset encoding.
---@param line string the line to measure
---@param offset_encoding string utf-8|utf-16|utf-32|nil (anything else is treated as utf-8)
---@returns integer length of the line for the given encoding
local function compute_line_length(line, offset_encoding)
  if offset_encoding == 'utf-16' then
    -- utf-16 uses the second value returned by str_utfindex
    local _, utf16_length = str_utfindex(line)
    return utf16_length
  elseif offset_encoding == 'utf-32' then
    -- utf-32 uses the first value returned by str_utfindex
    local utf32_length = str_utfindex(line)
    return utf32_length
  end
  -- utf-8 (or unspecified): plain byte length
  return #line
end

---@private
--- Aligns an (exclusive) end byte index to a character boundary and computes
--- the matching index for the requested offset encoding.
--- If `byte` points into the middle of a multi-byte character it is moved
--- forward to the first byte of the next character, so that the returned byte
--- and char indices always describe the same position.
---@param line string the line to index into
---@param byte integer 1-based byte index
---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
---@returns integer, integer aligned byte_idx and the corresponding char_idx
local function align_end_position(line, byte, offset_encoding)
  local char
  -- If on the first byte, or an empty string: the trivial case
  if byte == 1 or #line == 0 then
    char = byte
  -- Position just past the last byte, e.g. when extending an empty line "" -> "a"
  elseif byte == #line + 1 then
    char = compute_line_length(line, offset_encoding) + 1
  else
    -- str_utf_start is negative when `byte` is not the first byte of its
    -- character. Only then does the position need to move: skip the rest of
    -- the character (str_utf_end bytes) and step onto the next character.
    if str_utf_start(line, byte) < 0 then
      byte = byte + str_utf_end(line, byte) + 1
    end
    if byte <= #line then
      char = byte_to_utf(line, byte, offset_encoding)
    else
      -- Aligning moved the position past the final character of the line
      char = compute_line_length(line, offset_encoding) + 1
    end
  end
  return byte, char
end

---@private
--- Finds the first line, byte, and char index of the difference between the
--- previous and current lines buffer, normalized to the start of the
--- character containing the first differing byte.
---@param prev_lines table list of lines from previous buffer
---@param curr_lines table list of lines from current buffer
---@param firstline integer firstline from on_lines, adjusted to 1-index
---@param lastline integer lastline from on_lines, adjusted to 1-index
---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index
---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8)
---@returns table with fields line_idx, byte_idx, and char_idx of first change position
local function compute_start_range(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding)
  -- If firstline == lastline, no existing text is changed. All edit operations
  -- occur on a new line pointed to by lastline. This occurs during insertion of
  -- new lines(O), the new newline is inserted at the line indicated by
  -- new_lastline.
  -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  -- In this case, the first byte change is also at the first byte of firstline
  if firstline == new_lastline or firstline == lastline then
    return { line_idx = firstline, byte_idx = 1, char_idx = 1 }
  end

  local prev_line = prev_lines[firstline]
  local curr_line = curr_lines[firstline]

  -- Iterate across previous and current line containing first change
  -- to find the first different byte.
  -- Note: *about -> a*about will register the second a as the first
  -- difference, regardless of edit since we do not receive the first
  -- column of the edit from on_lines.
  local start_byte_idx = 1
  for idx = 1, #prev_line + 1 do
    start_byte_idx = idx
    if str_byte(prev_line, idx) ~= str_byte(curr_line, idx) then
      break
    end
  end

  -- Convert byte to codepoint if applicable
  local char_idx
  local byte_idx
  if start_byte_idx == 1 then
    byte_idx = start_byte_idx
    char_idx = 1
  elseif start_byte_idx == #prev_line + 1 then
    -- Difference starts just past the end of the previous line
    byte_idx = start_byte_idx
    char_idx = compute_line_length(prev_line, offset_encoding) + 1
  else
    -- Move back to the first byte of the character containing the difference
    -- (str_utf_start yields a zero or negative offset), then derive the char
    -- index from that aligned byte so both indices name the same character.
    byte_idx = start_byte_idx + str_utf_start(prev_line, start_byte_idx)
    char_idx = byte_to_utf(prev_line, byte_idx, offset_encoding)
  end

  -- Return the start difference (shared for new and prev lines)
  return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx }
end

---@private
--- Finds the end of the changed region in both the previous and the current buffer.
--- The end byte is located by scanning backwards from the end of the last changed
--- line and is then aligned with align_end_position.
--- prev_end_range is the end of the text range sent to the server representing the changed region.
--- curr_end_range is the end of the text that should be collected and sent to the server.
--
---@param prev_lines table list of lines from previous buffer
---@param curr_lines table list of lines from current buffer
---@param start_range table start position with line_idx, byte_idx and char_idx fields
---@param firstline integer firstline from on_lines, adjusted to 1-index
---@param lastline integer lastline from on_lines, adjusted to 1-index
---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index
---@param offset_encoding string utf-8|utf-16|utf-32|nil, forwarded to align_end_position
---@returns (table, table) prev_end_range and curr_end_range, each with line_idx, byte_idx and char_idx fields
local function compute_end_range(prev_lines, curr_lines, start_range, firstline, lastline, new_lastline, offset_encoding)
  -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  -- Both end positions are then placed at the first byte/char of a line: the
  -- previous-buffer end on line (lastline - new_lastline + firstline), the
  -- current-buffer end on firstline.
  if firstline == new_lastline then
      return { line_idx = (lastline - new_lastline + firstline), byte_idx = 1, char_idx = 1 }, { line_idx = firstline, byte_idx = 1, char_idx = 1 }
  end
  -- If firstline == lastline, the previous-buffer end stays at the start of
  -- firstline and the current-buffer end is placed at the start of line
  -- (new_lastline - lastline + firstline).
  if firstline == lastline then
      return { line_idx = firstline, byte_idx = 1, char_idx = 1 }, { line_idx = new_lastline - lastline + firstline, byte_idx = 1, char_idx = 1 }
  end
  -- Compare on last line, at minimum will be the start range
  local start_line_idx = start_range.line_idx

  -- lastline and new_lastline were last lines that were *not* replaced, compare previous lines
  local prev_line_idx = lastline - 1
  local curr_line_idx = new_lastline - 1

  local prev_line = prev_lines[lastline - 1]
  local curr_line = curr_lines[new_lastline - 1]

  local prev_line_length = #prev_line
  local curr_line_length = #curr_line

  -- Number of bytes, counted from the end of the line, at which the backwards
  -- scan below stopped. Stays 0 when the scan is skipped.
  local byte_offset = 0

  -- Editing the same line
  -- If the byte offset is zero, that means there is a difference on the last byte (not newline)
  if prev_line_idx == curr_line_idx then
    local max_length
    if start_line_idx == prev_line_idx then
      -- Search until beginning of difference
      max_length = min(prev_line_length - start_range.byte_idx, curr_line_length - start_range.byte_idx) + 1
    else
      max_length = min(prev_line_length, curr_line_length) + 1
    end
    -- Compare bytes from the end of both lines; stop at the first mismatch.
    -- If no mismatch is found, byte_offset ends up equal to max_length.
    for idx = 0, max_length do
      byte_offset = idx
      if
        str_byte(prev_line, prev_line_length - byte_offset) ~= str_byte(curr_line, curr_line_length - byte_offset)
      then
        break
      end
    end
  end

  -- Turn the offset from the end into a 1-based byte index one past the
  -- position where the scan stopped (the end of the range is exclusive)
  local prev_end_byte_idx = prev_line_length - byte_offset + 1

  -- Handle case where lines match
  if prev_end_byte_idx == 0 then
    prev_end_byte_idx = 1
  end
  -- Snap to a character boundary and obtain the index in the offset encoding
  local prev_byte_idx, prev_char_idx = align_end_position(prev_line, prev_end_byte_idx, offset_encoding)
  local prev_end_range = { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx }

  local curr_end_range
  -- Deletion event, new_range cannot be before start
  if curr_line_idx < start_line_idx then
    curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 }
  else
    -- Same conversion as for the previous line, applied to the current line
    local curr_end_byte_idx = curr_line_length - byte_offset + 1
    -- Handle case where lines match
    if curr_end_byte_idx == 0 then
      curr_end_byte_idx = 1
    end
    local curr_byte_idx, curr_char_idx = align_end_position(curr_line, curr_end_byte_idx, offset_encoding)
    curr_end_range = { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx }
  end

  return prev_end_range, curr_end_range
end

---@private
--- Collects the text lying between a start and an end position of a line list.
---@param lines table list of lines
---@param start_range table start position; reads line_idx and byte_idx (inclusive)
---@param end_range table end position; reads line_idx and byte_idx (exclusive)
---@param line_ending string separator placed between the collected lines
---@returns string text extracted from defined region
local function extract_text(lines, start_range, end_range, line_ending)
  local sub = string.sub
  local first_idx = start_range.line_idx
  local first_line = lines[first_idx]

  -- Nothing to extract when the start line does not exist
  if not first_line then
    return ""
  end

  local last_idx = end_range.line_idx

  -- Start and end on the same line: slice it directly.
  -- string.sub is inclusive while the end position is exclusive, hence the - 1.
  if first_idx == last_idx then
    return sub(first_line, start_range.byte_idx, end_range.byte_idx - 1)
  end

  -- Tail of the first line, from the start byte onwards
  local pieces = { sub(first_line, start_range.byte_idx) }

  -- Lines strictly between start and end are taken whole
  for idx = first_idx + 1, last_idx - 1 do
    pieces[#pieces + 1] = lines[idx]
  end

  -- Head of the last line up to (not including) the end byte; if that line
  -- does not exist an empty piece is added so the result ends with a line ending
  local last_line = lines[last_idx]
  if last_line then
    pieces[#pieces + 1] = sub(last_line, 1, end_range.byte_idx - 1)
  else
    pieces[#pieces + 1] = ""
  end

  -- Join all pieces with the line ending
  return table.concat(pieces, line_ending)
end

---@private
--- Computes the length of the range between two positions of a line list.
--- The unit depends on the offset encoding:
---   bytes for utf-8 (clangd with extension),
---   UTF-16 code units for utf-16,
---   codepoints for utf-32.
--- Every line ending inside the range adds the byte length of `line_ending`:
--- 2 for \r\n (dos), 1 for \n (unix), and 1 for \r (mac).
---@param lines table list of lines
---@param start_range table start position; reads line_idx and char_idx
---@param end_range table end position; reads line_idx and char_idx
---@param offset_encoding string utf-8|utf-16|utf-32|nil, forwarded to compute_line_length
---@param line_ending string line ending of the buffer
---@returns integer length of the range
local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending)
  local eol_length = #line_ending
  local first_idx = start_range.line_idx
  local last_idx = end_range.line_idx

  -- Range within one line: plain difference of the char indices
  if first_idx == last_idx then
    return end_range.char_idx - start_range.char_idx
  end

  -- First line: the part from the start char to the end of the line plus its
  -- line ending; a missing or empty line contributes only the line ending
  local first_line = lines[first_idx]
  local total
  if first_line and #first_line > 0 then
    total = compute_line_length(first_line, offset_encoding) - start_range.char_idx + 1 + eol_length
  else
    total = eol_length
  end

  -- Lines strictly in between count in full, each followed by a line ending
  for idx = first_idx + 1, last_idx - 1 do
    local middle_line = lines[idx]
    if #middle_line > 0 then
      total = total + compute_line_length(middle_line, offset_encoding) + eol_length
    else
      total = total + eol_length
    end
  end

  -- Last line: only the chars in front of the (exclusive) end position
  local last_line = lines[last_idx]
  if last_line and #last_line > 0 then
    total = total + end_range.char_idx - 1
  end

  return total
end

--- Builds the content change event describing the difference between the
--- previous and the current lines of a buffer.
---@param prev_lines table list of lines
---@param curr_lines table list of lines
---@param firstline number line to begin search for first difference
---@param lastline number line to begin search in old_lines for last difference
---@param new_lastline number line to begin search in new_lines for last difference
---@param offset_encoding string encoding requested by language server
---@param line_ending string line ending used when joining extracted lines and measuring the range
---@returns table TextDocumentContentChangeEvent see https://microsoft.github.io/language-server-protocol/specifications/specification-3-17/#textDocumentContentChangeEvent
function M.compute_diff(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding, line_ending)
  -- The range helpers expect the line numbers shifted by one
  local first = firstline + 1
  local last = lastline + 1
  local new_last = new_lastline + 1

  -- Start of the change; shared by the previous and the current buffer.
  -- Sent to the server as the start of the changed range and used to grab
  -- the changed text from the latest buffer.
  local start_range = compute_start_range(prev_lines, curr_lines, first, last, new_last, offset_encoding)

  -- End of the change in each buffer:
  -- prev_end_range is sent to the server as the end of the changed range,
  -- curr_end_range bounds the changed text taken from the latest buffer.
  local prev_end_range, curr_end_range =
    compute_end_range(prev_lines, curr_lines, start_range, first, last, new_last, offset_encoding)

  -- Replacement text from the current buffer ("" if the entire range is deleted)
  local text = extract_text(curr_lines, start_range, curr_end_range, line_ending)

  -- Length of the replaced text. Deprecated but still required for certain language servers
  local range_length = compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending)

  -- Positions are converted to 0 based indexing for the result
  return {
    range = {
      start = {
        line = start_range.line_idx - 1,
        character = start_range.char_idx - 1,
      },
      ['end'] = {
        line = prev_end_range.line_idx - 1,
        character = prev_end_range.char_idx - 1,
      },
    },
    text = text,
    rangeLength = range_length,
  }
end

return M