ghostty/pkg/simdutf/vendor/simdutf.cpp

/* auto-generated on 2026-04-21 21:46:47 -0400. Do not edit! */
/* begin file src/simdutf.cpp */
#include "simdutf.h"

/* begin file src/encoding_types.cpp */
namespace simdutf {
std::string_view to_string(encoding_type bom) {
  switch (bom) {
  case UTF16_LE:
    return "UTF16 little-endian";
  case UTF16_BE:
    return "UTF16 big-endian";
  case UTF32_LE:
    return "UTF32 little-endian";
  case UTF32_BE:
    return "UTF32 big-endian";
  case UTF8:
    return "UTF8";
  case unspecified:
    return "unknown";
  default:
    return "error";
  }
}

namespace BOM {
// Note that BOM for UTF8 is discouraged.
encoding_type check_bom(const uint8_t *byte, size_t length) {
  if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
    if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
      return encoding_type::UTF32_LE;
    } else {
      return encoding_type::UTF16_LE;
    }
  } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
    return encoding_type::UTF16_BE;
  } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and
             byte[2] == 0xfe and byte[3] == 0xff) {
    return encoding_type::UTF32_BE;
  } else if (length >= 3 && byte[0] == 0xef and byte[1] == 0xbb and
             byte[2] == 0xbf) {
    return encoding_type::UTF8;
  }
  return encoding_type::unspecified;
}

encoding_type check_bom(const char *byte, size_t length) {
  return check_bom(reinterpret_cast<const uint8_t *>(byte), length);
}

size_t bom_byte_size(encoding_type bom) {
  switch (bom) {
  case UTF16_LE:
    return 2;
  case UTF16_BE:
    return 2;
  case UTF32_LE:
    return 4;
  case UTF32_BE:
    return 4;
  case UTF8:
    return 3;
  case unspecified:
    return 0;
  default:
    return 0;
  }
}

} // namespace BOM
} // namespace simdutf
/* end file src/encoding_types.cpp */
/* begin file src/error.cpp */
namespace simdutf {
// deliberately empty
}
/* end file src/error.cpp */
// The large tables should be included once and they
// should not depend on a kernel.
/* begin file src/tables/utf8_to_utf16_tables.h */
#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
#define SIMDUTF_UTF8_TO_UTF16_TABLES_H
#include <cstdint>

namespace simdutf {
namespace {
namespace tables {
namespace utf8_to_utf16 {
/**
 * utf8bigindex uses about 8 kB
 * shufutf8 uses about 3344 B
 *
 * So we use a bit over 11 kB. It would be
 * easy to save about 4 kB by only
 * storing the index in utf8bigindex, and
 * deriving the consumed bytes otherwise.
 * However, this may come at a significant (10% to 20%)
 * performance penalty.
 */

const uint8_t shufutf8[209][16] = {
    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
    {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
    {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
    {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
    {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
    {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
    {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
    {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
    {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
    {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
    {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
    {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
/* number of two bytes : 64 */
/* number of two + three bytes : 145 */
/* number of two + three + four bytes : 209 */
const uint8_t utf8bigindex[4096][2] = {
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
    {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12},
    {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},
    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {209, 12},
    {148, 6},  {209, 12}, {151, 6},  {163, 6},  {66, 6},   {209, 12}, {154, 6},
    {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
    {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},
    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7},
    {164, 7},  {145, 3},  {209, 12}, {155, 7},  {167, 7},  {69, 7},   {179, 7},
    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {170, 7},  {71, 7},
    {182, 7},  {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},
    {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12},
    {173, 7},  {148, 6},  {185, 7},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
    {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},
    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},
    {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},
    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {156, 8},  {168, 8},  {146, 4},
    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {171, 8},
    {72, 8},   {183, 8},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
    {209, 12}, {174, 8},  {148, 6},  {186, 8},  {80, 8},   {98, 8},   {66, 6},
    {198, 8},  {86, 8},   {104, 8},  {68, 6},   {122, 8},  {74, 6},   {92, 6},
    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {76, 6},
    {94, 6},   {5, 8},    {193, 6},  {82, 6},   {100, 6},  {9, 8},    {118, 6},
    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
    {112, 8},  {71, 7},   {130, 8},  {77, 7},   {95, 7},   {6, 8},    {194, 7},
    {83, 7},   {101, 7},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
    {66, 6},   {197, 7},  {85, 7},   {103, 7},  {12, 8},   {121, 7},  {20, 8},
    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
    {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},
    {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
    {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {187, 9},  {81, 9},
    {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},
    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},
    {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},
    {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},
    {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {77, 7},   {95, 7},
    {7, 9},    {194, 7},  {83, 7},   {101, 7},  {11, 9},   {119, 7},  {19, 9},
    {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},
    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {13, 9},
    {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
    {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},
    {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
    {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},
    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
    {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},
    {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},
    {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},
    {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
    {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},
    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
    {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12},
    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},
    {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},
    {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
    {176, 10}, {148, 6},  {188, 10}, {151, 6},  {163, 6},  {66, 6},   {200, 10},
    {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},
    {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},
    {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},
    {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10},
    {152, 7},  {164, 7},  {145, 3},  {203, 10}, {90, 10},  {108, 10}, {69, 7},
    {126, 10}, {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {114, 10},
    {71, 7},   {132, 10}, {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},
    {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12},
    {209, 12}, {173, 7},  {148, 6},  {138, 10}, {79, 7},   {97, 7},   {66, 6},
    {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},
    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},
    {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},
    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {206, 10}, {156, 8},  {168, 8},
    {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},
    {116, 10}, {72, 8},   {134, 10}, {78, 8},   {96, 8},   {65, 5},   {195, 8},
    {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},
    {209, 12}, {209, 12}, {174, 8},  {148, 6},  {140, 10}, {80, 8},   {98, 8},
    {66, 6},   {198, 8},  {86, 8},   {104, 8},  {15, 10},  {122, 8},  {23, 10},
    {39, 10},  {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},
    {27, 10},  {43, 10},  {5, 8},    {193, 6},  {82, 6},   {51, 10},  {9, 8},
    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},
    {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
    {158, 7},  {112, 8},  {71, 7},   {130, 8},  {29, 10},  {45, 10},  {6, 8},
    {194, 7},  {83, 7},   {53, 10},  {10, 8},   {119, 7},  {18, 8},   {34, 8},
    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},
    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {57, 10},  {12, 8},   {121, 7},
    {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
    {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},
    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12},
    {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},
    {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},
    {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},
    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {142, 10},
    {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},
    {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},
    {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},
    {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},
    {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},
    {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {30, 10},
    {46, 10},  {7, 9},    {194, 7},  {83, 7},   {54, 10},  {11, 9},   {119, 7},
    {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
    {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {58, 10},
    {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},
    {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},
    {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},
    {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},
    {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},
    {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},
    {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},
    {60, 10},  {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12},
    {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},
    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},
    {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},
    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},
    {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},
    {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12},
    {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
    {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},
    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},
    {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},
    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},
    {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12},
    {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},
    {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
    {209, 12}, {209, 12}, {148, 6},  {209, 12}, {151, 6},  {163, 6},  {66, 6},
    {209, 12}, {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},
    {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},
    {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},
    {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {192, 11}, {152, 7},  {164, 7},  {145, 3},  {204, 11}, {155, 7},  {167, 7},
    {69, 7},   {179, 7},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
    {170, 7},  {71, 7},   {182, 7},  {77, 7},   {95, 7},   {65, 5},   {194, 7},
    {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},
    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {185, 7},  {79, 7},   {97, 7},
    {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},
    {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
    {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},
    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {207, 11}, {156, 8},
    {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12},
    {159, 8},  {117, 11}, {72, 8},   {135, 11}, {78, 8},   {96, 8},   {65, 5},
    {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},
    {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {141, 11}, {80, 8},
    {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},  {68, 6},   {122, 8},
    {74, 6},   {92, 6},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},
    {128, 8},  {76, 6},   {94, 6},   {5, 8},    {193, 6},  {82, 6},   {100, 6},
    {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},
    {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},
    {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {77, 7},   {95, 7},
    {6, 8},    {194, 7},  {83, 7},   {101, 7},  {10, 8},   {119, 7},  {18, 8},
    {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},
    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {12, 8},
    {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
    {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},
    {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},
    {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},
    {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},
    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},
    {143, 11}, {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},
    {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},
    {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},
    {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},
    {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},
    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},
    {31, 11},  {47, 11},  {7, 9},    {194, 7},  {83, 7},   {55, 11},  {11, 9},
    {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
    {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
    {59, 11},  {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12},
    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},
    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},
    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},
    {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},
    {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
    {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},
    {86, 8},   {61, 11},  {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},
    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},
    {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},
    {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
    {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},
    {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},
    {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},
    {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12},
    {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},
    {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},
    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},
    {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},
    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12},
    {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12},
    {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12},
    {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},
    {209, 12}, {209, 12}, {176, 10}, {148, 6},  {188, 10}, {151, 6},  {163, 6},
    {66, 6},   {200, 10}, {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},
    {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},
    {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},
    {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {191, 10}, {152, 7},  {164, 7},  {145, 3},  {203, 10}, {90, 10},
    {108, 10}, {69, 7},   {126, 10}, {75, 7},   {93, 7},   {64, 4},   {209, 12},
    {158, 7},  {114, 10}, {71, 7},   {132, 10}, {77, 7},   {95, 7},   {65, 5},
    {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},
    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {138, 10}, {79, 7},
    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},
    {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
    {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},
    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {206, 10},
    {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},
    {209, 12}, {159, 8},  {116, 10}, {72, 8},   {134, 10}, {78, 8},   {96, 8},
    {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},
    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {140, 10},
    {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {62, 11},  {15, 10},
    {122, 8},  {23, 10},  {39, 10},  {3, 8},    {209, 12}, {157, 6},  {110, 8},
    {70, 6},   {128, 8},  {27, 10},  {43, 10},  {5, 8},    {193, 6},  {82, 6},
    {51, 10},  {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},
    {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},
    {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {29, 10},
    {45, 10},  {6, 8},    {194, 7},  {83, 7},   {53, 10},  {10, 8},   {119, 7},
    {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
    {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {57, 10},
    {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},
    {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},
    {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
    {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},
    {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},
    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},
    {148, 6},  {142, 10}, {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},
    {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
    {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},
    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},
    {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},
    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},
    {131, 9},  {30, 10},  {46, 10},  {7, 9},    {194, 7},  {83, 7},   {54, 10},
    {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12},
    {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
    {85, 7},   {58, 10},  {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},
    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},
    {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},
    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},
    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},
    {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
    {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},
    {198, 8},  {86, 8},   {60, 10},  {14, 9},   {122, 8},  {22, 9},   {38, 9},
    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},
    {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},
    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
    {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},
    {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
    {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},
    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
    {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},
    {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
    {64, 4},   {209, 12}, {209, 12}, {209, 12}, {148, 6},  {209, 12}, {151, 6},
    {163, 6},  {66, 6},   {209, 12}, {154, 6},  {166, 6},  {68, 6},   {178, 6},
    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},
    {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {152, 7},  {164, 7},  {145, 3},  {209, 12},
    {155, 7},  {167, 7},  {69, 7},   {179, 7},  {75, 7},   {93, 7},   {64, 4},
    {209, 12}, {158, 7},  {170, 7},  {71, 7},   {182, 7},  {77, 7},   {95, 7},
    {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},
    {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {185, 7},
    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},
    {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
    {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},
    {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {208, 12}, {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
    {64, 4},   {209, 12}, {159, 8},  {171, 8},  {72, 8},   {183, 8},  {78, 8},
    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
    {186, 8},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},
    {68, 6},   {122, 8},  {74, 6},   {92, 6},   {3, 8},    {209, 12}, {157, 6},
    {110, 8},  {70, 6},   {128, 8},  {76, 6},   {94, 6},   {5, 8},    {193, 6},
    {82, 6},   {100, 6},  {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
    {77, 7},   {95, 7},   {6, 8},    {194, 7},  {83, 7},   {101, 7},  {10, 8},
    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
    {103, 7},  {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12},
    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},
    {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},
    {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
    {175, 9},  {148, 6},  {144, 12}, {81, 9},   {99, 9},   {66, 6},   {199, 9},
    {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},
    {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},
    {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},
    {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},
    {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},
    {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},
    {71, 7},   {131, 9},  {77, 7},   {95, 7},   {7, 9},    {194, 7},  {83, 7},
    {101, 7},  {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12},
    {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},
    {197, 7},  {85, 7},   {103, 7},  {13, 9},   {121, 7},  {21, 9},   {37, 9},
    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},
    {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},
    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},
    {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},
    {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},
    {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},
    {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},
    {66, 6},   {198, 8},  {86, 8},   {104, 8},  {14, 9},   {122, 8},  {22, 9},
    {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},
    {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},
    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},
    {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
    {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},
    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},
    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},
    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},
    {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
    {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},
    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12},
    {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},
    {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},
    {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},
    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {176, 10}, {148, 6},  {188, 10},
    {151, 6},  {163, 6},  {66, 6},   {200, 10}, {154, 6},  {166, 6},  {68, 6},
    {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},
    {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},
    {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},  {164, 7},  {145, 3},
    {203, 10}, {90, 10},  {108, 10}, {69, 7},   {126, 10}, {75, 7},   {93, 7},
    {64, 4},   {209, 12}, {158, 7},  {114, 10}, {71, 7},   {132, 10}, {77, 7},
    {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},
    {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
    {138, 10}, {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},
    {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},
    {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},
    {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3},  {206, 10}, {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},
    {161, 4},  {64, 4},   {209, 12}, {159, 8},  {116, 10}, {72, 8},   {134, 10},
    {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},
    {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},
    {148, 6},  {140, 10}, {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},
    {63, 12},  {15, 10},  {122, 8},  {23, 10},  {39, 10},  {3, 8},    {209, 12},
    {157, 6},  {110, 8},  {70, 6},   {128, 8},  {27, 10},  {43, 10},  {5, 8},
    {193, 6},  {82, 6},   {51, 10},  {9, 8},    {118, 6},  {17, 8},   {33, 8},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},
    {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},
    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},
    {130, 8},  {29, 10},  {45, 10},  {6, 8},    {194, 7},  {83, 7},   {53, 10},
    {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12},
    {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
    {85, 7},   {57, 10},  {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},
    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},
    {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},
    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},
    {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},
    {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},
    {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
    {209, 12}, {175, 9},  {148, 6},  {142, 10}, {81, 9},   {99, 9},   {66, 6},
    {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},
    {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},
    {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},
    {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},
    {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
    {113, 9},  {71, 7},   {131, 9},  {30, 10},  {46, 10},  {7, 9},    {194, 7},
    {83, 7},   {54, 10},  {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},
    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},
    {66, 6},   {197, 7},  {85, 7},   {58, 10},  {13, 9},   {121, 7},  {21, 9},
    {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
    {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},
    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},
    {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12},
    {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},
    {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},
    {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},
    {98, 8},   {66, 6},   {198, 8},  {86, 8},   {60, 10},  {14, 9},   {122, 8},
    {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},
    {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},
    {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},
    {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},
    {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},
    {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},
    {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},
    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},
    {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
    {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},
    {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},
    {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},
    {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},
    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {209, 12}, {148, 6},
    {209, 12}, {151, 6},  {163, 6},  {66, 6},   {209, 12}, {154, 6},  {166, 6},
    {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},
    {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},
    {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7},  {164, 7},
    {145, 3},  {204, 11}, {155, 7},  {167, 7},  {69, 7},   {179, 7},  {75, 7},
    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {170, 7},  {71, 7},   {182, 7},
    {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},
    {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
    {148, 6},  {185, 7},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
    {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12},
    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},
    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {145, 3},  {207, 11}, {156, 8},  {168, 8},  {146, 4},  {180, 8},
    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {117, 11}, {72, 8},
    {135, 11}, {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},
    {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
    {174, 8},  {148, 6},  {141, 11}, {80, 8},   {98, 8},   {66, 6},   {198, 8},
    {86, 8},   {104, 8},  {68, 6},   {122, 8},  {74, 6},   {92, 6},   {3, 8},
    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {76, 6},   {94, 6},
    {5, 8},    {193, 6},  {82, 6},   {100, 6},  {9, 8},    {118, 6},  {17, 8},
    {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
    {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},
    {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},
    {71, 7},   {130, 8},  {77, 7},   {95, 7},   {6, 8},    {194, 7},  {83, 7},
    {101, 7},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12},
    {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},
    {197, 7},  {85, 7},   {103, 7},  {12, 8},   {121, 7},  {20, 8},   {36, 8},
    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},
    {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},
    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12},
    {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},
    {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},
    {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},
    {209, 12}, {209, 12}, {175, 9},  {148, 6},  {143, 11}, {81, 9},   {99, 9},
    {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},
    {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},
    {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},
    {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},
    {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
    {158, 7},  {113, 9},  {71, 7},   {131, 9},  {31, 11},  {47, 11},  {7, 9},
    {194, 7},  {83, 7},   {55, 11},  {11, 9},   {119, 7},  {19, 9},   {35, 9},
    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},
    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {59, 11},  {13, 9},   {121, 7},
    {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
    {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},
    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},
    {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},
    {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},
    {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},
    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},
    {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {61, 11},  {14, 9},
    {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},
    {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},
    {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},
    {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},
    {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},
    {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},
    {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
    {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},
    {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},
    {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},
    {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
    {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12},
    {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},
    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {176, 10},
    {148, 6},  {188, 10}, {151, 6},  {163, 6},  {66, 6},   {200, 10}, {154, 6},
    {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
    {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},
    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},
    {164, 7},  {145, 3},  {203, 10}, {90, 10},  {108, 10}, {69, 7},   {126, 10},
    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {114, 10}, {71, 7},
    {132, 10}, {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},
    {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12},
    {173, 7},  {148, 6},  {138, 10}, {79, 7},   {97, 7},   {66, 6},   {197, 7},
    {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},
    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},
    {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},
    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {145, 3},  {206, 10}, {156, 8},  {168, 8},  {146, 4},
    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {116, 10},
    {72, 8},   {134, 10}, {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
    {209, 12}, {174, 8},  {148, 6},  {140, 10}, {80, 8},   {98, 8},   {66, 6},
    {198, 8},  {86, 8},   {62, 11},  {15, 10},  {122, 8},  {23, 10},  {39, 10},
    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {27, 10},
    {43, 10},  {5, 8},    {193, 6},  {82, 6},   {51, 10},  {9, 8},    {118, 6},
    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
    {112, 8},  {71, 7},   {130, 8},  {29, 10},  {45, 10},  {6, 8},    {194, 7},
    {83, 7},   {53, 10},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
    {66, 6},   {197, 7},  {85, 7},   {57, 10},  {12, 8},   {121, 7},  {20, 8},
    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
    {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},
    {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
    {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {142, 10}, {81, 9},
    {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},
    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},
    {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
    {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},
    {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},
    {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {30, 10},  {46, 10},
    {7, 9},    {194, 7},  {83, 7},   {54, 10},  {11, 9},   {119, 7},  {19, 9},
    {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},
    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {58, 10},  {13, 9},
    {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
    {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},
    {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
    {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
    {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},
    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
    {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {60, 10},
    {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},
    {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},
    {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
    {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},
    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
    {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
    {0, 6}};
} // namespace utf8_to_utf16
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
/* end file src/tables/utf8_to_utf16_tables.h */
/* begin file src/tables/utf16_to_utf8_tables.h */
// file generated by scripts/sse_convert_utf16_to_utf8.py
#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
#define SIMDUTF_UTF16_TO_UTF8_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace utf16_to_utf8 {

// 1 byte for length, 16 bytes for mask
const uint8_t pack_1_2_utf8_bytes[256][17] = {
    {16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
    {15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {15, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {15, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80},
    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
    {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
    {12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80}};

// 1 byte for length, 16 bytes for mask
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
    {12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80},
    {9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80}};

} // namespace utf16_to_utf8
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
/* end file src/tables/utf16_to_utf8_tables.h */
/* begin file src/tables/utf32_to_utf16_tables.h */
// file generated by scripts/sse_convert_utf32_to_utf16.py
#ifndef SIMDUTF_UTF32_TO_UTF16_TABLES_H
#define SIMDUTF_UTF32_TO_UTF16_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace utf32_to_utf16 {

const uint8_t pack_utf32_to_utf16le[16][16] = {
    {0, 1, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 0x80},
    {0, 1, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 0x80, 0x80},
    {0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80},
    {0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0x80, 0x80},
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
};

const uint8_t pack_utf32_to_utf16be[16][16] = {
    {1, 0, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 0x80, 0x80},
    {1, 0, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 13, 12, 15, 14, 0x80, 0x80},
    {1, 0, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {1, 0, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80},
    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
};

} // namespace utf32_to_utf16
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
/* end file src/tables/utf32_to_utf16_tables.h */
// End of tables.

// Implementations: they need to be setup before including
// scalar/* code, as the scalar code is sometimes enabled
// only for peculiar build targets.

// The best choice should always come first!
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
SIMDUTF_DISABLE_UNUSED_WARNING
#endif
/* begin file src/simdutf/arm64.h */
#ifndef SIMDUTF_ARM64_H
#define SIMDUTF_ARM64_H

#ifdef SIMDUTF_FALLBACK_H
  #error "arm64.h must be included before fallback.h"
#endif


#ifndef SIMDUTF_IMPLEMENTATION_ARM64
  #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
  #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 1
#else
  #define SIMDUTF_CAN_ALWAYS_RUN_ARM64 0
#endif


#if SIMDUTF_IMPLEMENTATION_ARM64

namespace simdutf {
/**
 * Implementation for NEON (ARMv8).
 */
namespace arm64 {} // namespace arm64
} // namespace simdutf

/* begin file src/simdutf/arm64/implementation.h */
#ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
#define SIMDUTF_ARM64_IMPLEMENTATION_H


namespace simdutf {
namespace arm64 {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("arm64", "ARM NEON",
                                internal::instruction_set::NEON) {}
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;
  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char16_t *input, size_t length) const noexcept override;
};

} // namespace arm64
} // namespace simdutf

#endif // SIMDUTF_ARM64_IMPLEMENTATION_H
/* end file src/simdutf/arm64/implementation.h */

/* begin file src/simdutf/arm64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "arm64"
// #define SIMDUTF_IMPLEMENTATION arm64
/* end file src/simdutf/arm64/begin.h */

  // Declarations
/* begin file src/simdutf/arm64/intrinsics.h */
#ifndef SIMDUTF_ARM64_INTRINSICS_H
#define SIMDUTF_ARM64_INTRINSICS_H


// This should be the correct header whether
// you use visual studio or other compilers.
#include <arm_neon.h>

#endif //  SIMDUTF_ARM64_INTRINSICS_H
/* end file src/simdutf/arm64/intrinsics.h */
/* begin file src/simdutf/arm64/bitmanipulation.h */
#ifndef SIMDUTF_ARM64_BITMANIPULATION_H
#define SIMDUTF_ARM64_BITMANIPULATION_H

namespace simdutf {
namespace arm64 {
namespace {

/* result might be undefined when input_num is zero */
simdutf_really_inline int count_ones(uint64_t input_num) {
  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
}

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  unsigned long ret;
  // Search the mask data from least significant bit (LSB)
  // to the most significant bit (MSB) for a set bit (1).
  _BitScanForward64(&ret, input_num);
  return (int)ret;
  #else  // SIMDUTF_REGULAR_VISUAL_STUDIO
  return __builtin_ctzll(input_num);
  #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
}
#endif
template <typename T> T clear_least_significant_bit(T x) {
  return (x & (x - 1));
}

} // unnamed namespace
} // namespace arm64
} // namespace simdutf

#endif // SIMDUTF_ARM64_BITMANIPULATION_H
/* end file src/simdutf/arm64/bitmanipulation.h */
/* begin file src/simdutf/arm64/simd.h */
#ifndef SIMDUTF_ARM64_SIMD_H
#define SIMDUTF_ARM64_SIMD_H

#include <type_traits>

namespace simdutf {
namespace arm64 {
namespace {
namespace simd {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
namespace {
  // Start of private section with Visual Studio workaround

  #ifndef simdutf_make_uint8x16_t
    #define simdutf_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,   \
                                    x11, x12, x13, x14, x15, x16)              \
      ([=]() {                                                                 \
        uint8_t array[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,             \
                             x9, x10, x11, x12, x13, x14, x15, x16};           \
        return vld1q_u8(array);                                                \
      }())
  #endif
  #ifndef simdutf_make_int8x16_t
    #define simdutf_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,    \
                                   x11, x12, x13, x14, x15, x16)               \
      ([=]() {                                                                 \
        int8_t array[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,              \
                            x9, x10, x11, x12, x13, x14, x15, x16};            \
        return vld1q_s8(array);                                                \
      }())
  #endif

  #ifndef simdutf_make_uint8x8_t
    #define simdutf_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8)             \
      ([=]() {                                                                 \
        uint8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                   \
        return vld1_u8(array);                                                 \
      }())
  #endif
  #ifndef simdutf_make_int8x8_t
    #define simdutf_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8)              \
      ([=]() {                                                                 \
        int8_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                    \
        return vld1_s8(array);                                                 \
      }())
  #endif
  #ifndef simdutf_make_uint16x8_t
    #define simdutf_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8)            \
      ([=]() {                                                                 \
        uint16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                  \
        return vld1q_u16(array);                                               \
      }())
  #endif
  #ifndef simdutf_make_int16x8_t
    #define simdutf_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8)             \
      ([=]() {                                                                 \
        int16_t array[8] = {x1, x2, x3, x4, x5, x6, x7, x8};                   \
        return vld1q_s16(array);                                               \
      }())
  #endif

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO

template <typename T> struct simd8;

//
// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t
// internally.
//
template <typename T, typename Mask = simd8<bool>> struct base_u8 {
  uint8x16_t value;
  static const int SIZE = sizeof(value);
  void dump() const {
#ifdef SIMDUTF_LOGGING
    uint8_t temp[16];
    vst1q_u8(temp, *this);
    printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x,%04x, %04x, %04x, "
           "%04x, %04x, %04x, %04x, %04x]\n",
           temp[0], temp[1], temp[2], temp[3], temp[4], temp[5], temp[6],
           temp[7], temp[8], temp[9], temp[10], temp[11], temp[12], temp[13],
           temp[14], temp[15]);
#endif // SIMDUTF_LOGGING
  }
  // Conversion from/to SIMD register
  simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
  simdutf_really_inline operator const uint8x16_t &() const {
    return this->value;
  }

  // Bit operations
  simdutf_really_inline simd8<T> operator|(const simd8<T> other) const {
    return vorrq_u8(*this, other);
  }
  simdutf_really_inline simd8<T> operator&(const simd8<T> other) const {
    return vandq_u8(*this, other);
  }
  simdutf_really_inline simd8<T> operator^(const simd8<T> other) const {
    return veorq_u8(*this, other);
  }
  simdutf_really_inline simd8<T> &operator|=(const simd8<T> other) {
    auto this_cast = static_cast<simd8<T> *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }

  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return vceqq_u8(lhs, rhs);
  }

  template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return vextq_u8(prev_chunk, *this, 16 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base_u8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return vmovq_n_u8(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8(const uint8x16_t _value)
      : base_u8<bool>(_value) {}
  // False constructor
  simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return vst1q_u8(dst, *this);
  }

  // We return uint32_t instead of uint16_t because that seems to be more
  // efficient for most purposes (cutting it down to uint16_t costs performance
  // in some compilers).
  simdutf_really_inline uint32_t to_bitmask() const {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask =
        simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    auto minput = *this & bit_mask;
    uint8x16_t tmp = vpaddq_u8(minput, minput);
    tmp = vpaddq_u8(tmp, tmp);
    tmp = vpaddq_u8(tmp, tmp);
    return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
  }

  // Returns 4-bit out of each byte, alternating between the high 4 bits and low
  // bits result it is 64 bit. This method is expected to be faster than none()
  // and is equivalent when the vector register is the result of a comparison,
  // with byte values 0xff and 0x00.
  simdutf_really_inline uint64_t to_bitmask64() const {
    return vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base_u8<uint8_t> {
  static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) {
    return vmovq_n_u8(_value);
  }
  static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
  static simdutf_really_inline simd8<uint8_t> load(const uint8_t *values) {
    return vld1q_u8(values);
  }
  simdutf_really_inline simd8(const uint8x16_t _value)
      : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdutf_really_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8(simdutf_make_uint8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                                      v10, v11, v12, v13, v14, v15)) {}
#else
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8(uint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                         v13, v14, v15}) {}
#endif

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdutf_really_inline static simd8<uint8_t>
  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
            uint8_t v15) {
    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                          v13, v14, v15);
  }

  // Store to array
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return vst1q_u8(dst, *this);
  }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd8<uint8_t>
  operator-(const simd8<uint8_t> other) const {
    return vsubq_u8(*this, other);
  }
  simdutf_really_inline simd8<uint8_t> &operator-=(const simd8<uint8_t> other) {
    *this = *this - other;
    return *this;
  }

  // Order-specific operations
  simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return vcgeq_u8(*this, other);
  }
  simdutf_really_inline simd8<bool>
  operator>(const simd8<uint8_t> other) const {
    return vcgtq_u8(*this, other);
  }
  // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true
  // = nonzero. For ARM, returns all 1's.
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return simd8<uint8_t>(*this > other);
  }

  // Bit-specific operations
  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
    return vtstq_u8(*this, bits);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->max_val() < 0b10000000u;
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return this->max_val() != 0;
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return vshrq_n_u8(*this, N);
  }
  simdutf_really_inline uint16_t sum_bytes() const { return vaddvq_u8(*this); }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }

  template <typename T>
  simdutf_really_inline simd8<uint8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    return vqtbl1q_u8(*this, simd8<uint8_t>(original));
  }
};

// Signed bytes
template <> struct simd8<int8_t> {
  int8x16_t value;
  static const int SIZE = sizeof(value);

  static simdutf_really_inline simd8<int8_t> splat(int8_t _value) {
    return vmovq_n_s8(_value);
  }
  static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
  static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) {
    return vld1q_s8(values);
  }

  // Use ST2 instead of UXTL+UXTL2 to interleave zeroes. UXTL is actually a
  // USHLL #0, and shifting in NEON is actually quite slow.
  //
  // While this needs the registers to be in a specific order, bigger cores can
  // interleave these with no overhead, and it still performs decently on little
  // cores.
  //    movi  v1.3d, #0
  //      mov   v0.16b, value[0]
  //    st2   {v0.16b, v1.16b}, [ptr], #32
  //      mov   v0.16b, value[1]
  //    st2   {v0.16b, v1.16b}, [ptr], #32
  //    ...
  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
    constexpr auto matches = match_system(big_endian);
    const int8x16x2_t pair = matches
                                 ? int8x16x2_t{{this->value, vmovq_n_s8(0)}}
                                 : int8x16x2_t{{vmovq_n_s8(0), this->value}};
    vst2q_s8(reinterpret_cast<int8_t *>(p), pair);
  }

  // In places where the table can be reused, which is most uses in simdutf, it
  // is worth it to do 4 table lookups, as there is no direct zero extension
  // from u8 to u32.
  simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const {
    const simd8<uint8_t> tb1{0, 255, 255, 255, 1, 255, 255, 255,
                             2, 255, 255, 255, 3, 255, 255, 255};
    const simd8<uint8_t> tb2{4, 255, 255, 255, 5, 255, 255, 255,
                             6, 255, 255, 255, 7, 255, 255, 255};
    const simd8<uint8_t> tb3{8,  255, 255, 255, 9,  255, 255, 255,
                             10, 255, 255, 255, 11, 255, 255, 255};
    const simd8<uint8_t> tb4{12, 255, 255, 255, 13, 255, 255, 255,
                             14, 255, 255, 255, 15, 255, 255, 255};

    // encourage store pairing and interleaving
    const auto shuf1 = this->apply_lookup_16_to(tb1);
    const auto shuf2 = this->apply_lookup_16_to(tb2);
    shuf1.store(reinterpret_cast<int8_t *>(p));
    shuf2.store(reinterpret_cast<int8_t *>(p + 4));

    const auto shuf3 = this->apply_lookup_16_to(tb3);
    const auto shuf4 = this->apply_lookup_16_to(tb4);
    shuf3.store(reinterpret_cast<int8_t *>(p + 8));
    shuf4.store(reinterpret_cast<int8_t *>(p + 12));
  }
  // Conversion from/to SIMD register
  simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
  simdutf_really_inline operator const int8x16_t &() const {
    return this->value;
  }
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline operator const uint8x16_t() const {
    return vreinterpretq_u8_s8(this->value);
  }
#endif
  simdutf_really_inline operator int8x16_t &() { return this->value; }

  // Zero constructor
  simdutf_really_inline simd8() : simd8(zero()) {}
  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
  // Member-by-member initialization
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
                              int8_t v4, int8_t v5, int8_t v6, int8_t v7,
                              int8_t v8, int8_t v9, int8_t v10, int8_t v11,
                              int8_t v12, int8_t v13, int8_t v14, int8_t v15)
      : simd8(simdutf_make_int8x16_t(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
                                     v10, v11, v12, v13, v14, v15)) {}
#else
  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
                              int8_t v4, int8_t v5, int8_t v6, int8_t v7,
                              int8_t v8, int8_t v9, int8_t v10, int8_t v11,
                              int8_t v12, int8_t v13, int8_t v14, int8_t v15)
      : simd8(int8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                        v13, v14, v15}) {}
#endif

  // Store to array
  simdutf_really_inline void store(int8_t dst[16]) const {
    return vst1q_s8(dst, value);
  }
  // Explicit conversion to/from unsigned
  //
  // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same
  // type. In theory, we could check this occurrence with std::same_as and
  // std::enabled_if but it is C++14 and relatively ugly and hard to read.
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline explicit simd8(const uint8x16_t other)
      : simd8(vreinterpretq_s8_u8(other)) {}
#endif
  simdutf_really_inline operator simd8<uint8_t>() const {
    return vreinterpretq_u8_s8(this->value);
  }

  simdutf_really_inline simd8<int8_t>
  operator|(const simd8<int8_t> other) const {
    return vorrq_s8(value, other.value);
  }

  simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
  simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
  simdutf_really_inline bool is_ascii() const { return this->min_val() >= 0; }

  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return vcgtq_s8(value, other.value);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return vcltq_s8(value, other.value);
  }

  template <typename T>
  simdutf_really_inline simd8<int8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    return vqtbl1q_s8(*this, simd8<uint8_t>(original));
  }
};

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4,
                "ARM kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask =
        simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    // Add each of the elements next to each other, successively, to stuff each
    // 8 byte mask into one.
    uint8x16_t sum0 =
        vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask),
                  vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
    uint8x16_t sum1 =
        vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask),
                  vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
                          this->chunks[2] >= mask, this->chunks[3] >= mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
                          simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
                          simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
                          simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask)
        .to_bitmask();
  }
}; // struct simd8x64<T>
/* begin file src/simdutf/arm64/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>> struct base_u16 {
  uint16x8_t value;
  /// the size of vector in bytes
  static const int SIZE = sizeof(value);
  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
  // Conversion from/to SIMD register
  simdutf_really_inline base_u16() = default;
  simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
  simdutf_really_inline operator const uint16x8_t &() const {
    return this->value;
  }
  simdutf_really_inline operator uint16x8_t &() { return this->value; }
  // Bit operations
  simdutf_really_inline simd16<T> operator|(const simd16<T> other) const {
    return vorrq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> operator&(const simd16<T> other) const {
    return vandq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> operator^(const simd16<T> other) const {
    return veorq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const {
    return vbicq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
  simdutf_really_inline simd16<T> &operator|=(const simd16<T> other) {
    auto this_cast = static_cast<simd16<T> *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }
  simdutf_really_inline simd16<T> &operator&=(const simd16<T> other) {
    auto this_cast = static_cast<simd16<T> *>(this);
    *this_cast = *this_cast & other;
    return *this_cast;
  }
  simdutf_really_inline simd16<T> &operator^=(const simd16<T> other) {
    auto this_cast = static_cast<simd16<T> *>(this);
    *this_cast = *this_cast ^ other;
    return *this_cast;
  }

  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return vceqq_u16(lhs, rhs);
  }

  template <int N = 1>
  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
    return vextq_u18(prev_chunk, *this, 8 - N);
  }
};

template <typename T, typename Mask = simd16<bool>>
struct base16 : base_u16<T> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  simdutf_really_inline base16() : base_u16<T>() {}
  simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr) : base16(vld1q_u16(ptr)) {}

  static const int SIZE = sizeof(base_u16<T>::value);
  void dump() const {
#ifdef SIMDUTF_LOGGING
    uint16_t temp[8];
    vst1q_u16(temp, *this);
    printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0],
           temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]);
#endif // SIMDUTF_LOGGING
  }
  template <int N = 1>
  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
    return vextq_u18(prev_chunk, *this, 8 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return vmovq_n_u16(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}
  simdutf_really_inline simd16(const uint16x8_t _value)
      : base16<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return vmovq_n_u16(_value);
  }
  static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
  static simdutf_really_inline simd16<T> load(const T values[8]) {
    return vld1q_u16(reinterpret_cast<const uint16_t *>(values));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}
  simdutf_really_inline base16_numeric(const uint16x8_t _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return vst1q_u16(dst, *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
    return vaddq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
    return vsubq_u16(*this, other);
  }
  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
    *this = *this + other;
    return *static_cast<simd16<T> *>(this);
  }
  simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
    *this = *this - other;
    return *static_cast<simd16<T> *>(this);
  }
};

// Signed code units
template <> struct simd16<int16_t> : base16_numeric<int16_t> {
  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
  simdutf_really_inline simd16(const uint16x8_t _value)
      : base16_numeric<int16_t>(_value) {}
#endif
  simdutf_really_inline simd16(const int16x8_t _value)
      : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}

  // Splat constructor
  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
  simdutf_really_inline operator simd16<uint16_t>() const;
  simdutf_really_inline operator const uint16x8_t &() const {
    return this->value;
  }
  simdutf_really_inline operator const int16x8_t() const {
    return vreinterpretq_s16_u16(this->value);
  }

  simdutf_really_inline int16_t max_val() const {
    return vmaxvq_s16(vreinterpretq_s16_u16(this->value));
  }
  simdutf_really_inline int16_t min_val() const {
    return vminvq_s16(vreinterpretq_s16_u16(this->value));
  }
  // Order-sensitive comparisons
  simdutf_really_inline simd16<int16_t>
  max_val(const simd16<int16_t> other) const {
    return vmaxq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
  simdutf_really_inline simd16<int16_t>
  min_val(const simd16<int16_t> other) const {
    return vmaxq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
  simdutf_really_inline simd16<bool>
  operator>(const simd16<int16_t> other) const {
    return vcgtq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
  simdutf_really_inline simd16<bool>
  operator<(const simd16<int16_t> other) const {
    return vcltq_s16(vreinterpretq_s16_u16(this->value),
                     vreinterpretq_s16_u16(other.value));
  }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const uint16x8_t _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
  simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
  // Saturated math
  simdutf_really_inline simd16<uint16_t>
  saturating_add(const simd16<uint16_t> other) const {
    return vqaddq_u16(*this, other);
  }
  simdutf_really_inline simd16<uint16_t>
  saturating_sub(const simd16<uint16_t> other) const {
    return vqsubq_u16(*this, other);
  }

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return vmaxq_u16(*this, other);
  }
  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return vminq_u16(*this, other);
  }
  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd16<uint16_t>
  gt_bits(const simd16<uint16_t> other) const {
    return this->saturating_sub(other);
  }
  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd16<uint16_t>
  lt_bits(const simd16<uint16_t> other) const {
    return other.saturating_sub(*this);
  }
  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return vcleq_u16(*this, other);
  }
  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return vcgeq_u16(*this, other);
  }
  simdutf_really_inline simd16<bool>
  operator>(const simd16<uint16_t> other) const {
    return vcgtq_u16(*this, other);
  }
  simdutf_really_inline simd16<bool>
  operator<(const simd16<uint16_t> other) const {
    return vcltq_u16(*this, other);
  }

  // Bit-specific operations
  simdutf_really_inline simd16<bool> bits_not_set() const {
    return *this == uint16_t(0);
  }
  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return simd16<uint16_t>(vshrq_n_u16(*this, N));
  }
  template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
    return simd16<uint16_t>(vshlq_n_u16(*this, N));
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return vqmovn_high_u16(vqmovn_u16(v0), v1);
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(*this)));
  }

  void dump() const {
    uint16_t temp[8];
    vst1q_u16(temp, *this);
    printf("[%04x, %04x, %04x, %04x, %04x, %04x, %04x, %04x]\n", temp[0],
           temp[1], temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]);
  }

  simdutf_really_inline uint32_t sum() const { return vaddlvq_u16(value); }
};

simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
  return this->value;
}

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 4,
                "ARM kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
    this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
    this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint8x16_t bit_mask =
        simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
    // Add each of the elements next to each other, successively, to stuff each
    // 8 byte mask into one.
    uint8x16_t sum0 = vpaddq_u8(
        vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)),
        vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
    uint8x16_t sum1 = vpaddq_u8(
        vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)),
        vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }
  simdutf_really_inline uint64_t gt(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                           this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
                           this->chunks[2] >= mask, this->chunks[3] >= mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                           this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(low);
    const simd16<T> mask_high = simd16<T>::splat(high);
    return simd16x32<bool>(
               (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
               (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
               (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
               (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>
template <>
simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(
    const uint16_t low, const uint16_t high) const {
  const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
  const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
  simd16x32<uint16_t> x(simd16<uint16_t>((this->chunks[0] > mask_high) |
                                         (this->chunks[0] < mask_low)),
                        simd16<uint16_t>((this->chunks[1] > mask_high) |
                                         (this->chunks[1] < mask_low)),
                        simd16<uint16_t>((this->chunks[2] > mask_high) |
                                         (this->chunks[2] < mask_low)),
                        simd16<uint16_t>((this->chunks[3] > mask_high) |
                                         (this->chunks[3] < mask_low)));
  return x.to_bitmask();
}

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> a,
                                           simd16<uint16_t> b) {
  return vminq_u16(a.value, b.value);
}
/* end file src/simdutf/arm64/simd16-inl.h */
/* begin file src/simdutf/arm64/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  static const size_t SIZE = sizeof(uint32x4_t);
  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  uint32x4_t value;

  simdutf_really_inline simd32(const uint32x4_t v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd32(const Pointer *ptr)
      : value(vld1q_u32(reinterpret_cast<const uint32_t *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const { return vaddvq_u32(value); }

  simdutf_really_inline simd32<uint32_t> swap_bytes() const {
    return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(value)));
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shr() const {
    return vshrq_n_u32(value, N);
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shl() const {
    return vshlq_n_u32(value, N);
  }

  void dump() const {
#ifdef SIMDUTF_LOGGING
    uint32_t temp[4];
    vst1q_u32(temp, value);
    printf("[%08x, %08x, %08x, %08x]\n", temp[0], temp[1], temp[2], temp[3]);
#endif // SIMDUTF_LOGGING
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = vaddq_u32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return vdupq_n_u32(0);
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return vdupq_n_u32(v);
  }
};

//----------------------------------------------------------------------

template <> struct simd32<bool> {
  uint32x4_t value;

  simdutf_really_inline simd32(const uint32x4_t v) : value(v) {}

  simdutf_really_inline bool any() const { return vmaxvq_u32(value) != 0; }
};

//----------------------------------------------------------------------

template <typename T>
simdutf_really_inline simd32<T> operator|(const simd32<T> a,
                                          const simd32<T> b) {
  return vorrq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return vminq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> max(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return vmaxq_u32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              uint32_t b) {
  return vceqq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return vandq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return vandq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<uint32_t> operator|(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return vorrq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<uint32_t> operator+(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return vaddq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator-(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return vsubq_u32(a.value, vdupq_n_u32(b));
}

simdutf_really_inline simd32<bool> operator>=(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return vcgeq_u32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator!(const simd32<bool> v) {
  return vmvnq_u32(v.value);
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return vcgtq_u32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> select(const simd32<bool> cond,
                                              const simd32<uint32_t> v_true,
                                              const simd32<uint32_t> v_false) {
  return vbslq_u32(cond.value, v_true.value, v_false.value);
}
/* end file src/simdutf/arm64/simd32-inl.h */
/* begin file src/simdutf/arm64/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  uint64x2_t value;

  simdutf_really_inline simd64(const uint64x2_t v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd64(const Pointer *ptr)
      : value(vld1q_u64(reinterpret_cast<const uint64_t *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const { return vaddvq_u64(value); }

  // operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = vaddq_u64(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd64<uint64_t> zero() {
    return vdupq_n_u64(0);
  }

  simdutf_really_inline static simd64<uint64_t> splat(uint64_t v) {
    return vdupq_n_u64(v);
  }
};
/* end file src/simdutf/arm64/simd64-inl.h */

simdutf_really_inline simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  // We do it as 3 instructions. There might be a faster way.
  // We hope that these 3 instructions are cheap.
  uint16x8_t first_sum = vpaddlq_u8(v);
  uint32x4_t second_sum = vpaddlq_u16(first_sum);
  return vpaddlq_u32(second_sum);
}

} // namespace simd
} // unnamed namespace
} // namespace arm64
} // namespace simdutf

#endif // SIMDUTF_ARM64_SIMD_H
/* end file src/simdutf/arm64/simd.h */

/* begin file src/simdutf/arm64/end.h */
/* end file src/simdutf/arm64/end.h */

#endif // SIMDUTF_IMPLEMENTATION_ARM64

#endif // SIMDUTF_ARM64_H
/* end file src/simdutf/arm64.h */
/* begin file src/simdutf/icelake.h */
#ifndef SIMDUTF_ICELAKE_H
#define SIMDUTF_ICELAKE_H


#ifdef __has_include
  // How do we detect that a compiler supports vbmi2?
  // For sure if the following header is found, we are ok?
  #if __has_include(<avx512vbmi2intrin.h>)
    #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
  #endif
#endif

#ifdef _MSC_VER
  #if _MSC_VER >= 1930
    // Visual Studio 2022 and up support VBMI2 under x64 even if the header
    // avx512vbmi2intrin.h is not found.
    // Visual Studio 2019 technically supports VBMI2, but the implementation
    // might be unreliable. Search for visualstudio2019icelakeissue in our
    // tests.
    #ifndef SIMDUTF_COMPILER_SUPPORTS_VBMI2
      #define SIMDUTF_COMPILER_SUPPORTS_VBMI2 1
    #endif
  #endif
#endif

#if SIMDUTF_GCC9OROLDER && SIMDUTF_IS_X86_64
  #define SIMDUTF_IMPLEMENTATION_ICELAKE 0
  #warning                                                                     \
      "You are using a legacy GCC compiler, we are disabling AVX-512 support"
#endif

// We allow icelake on x64 as long as the compiler is known to support VBMI2.
#ifndef SIMDUTF_IMPLEMENTATION_ICELAKE
  #define SIMDUTF_IMPLEMENTATION_ICELAKE                                       \
    ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
#endif

// To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
// https://github.com/simdutf/simdutf/issues/1247
#if ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) &&  \
     (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL &&   \
      SIMDUTF_HAS_AVX512VBMI2) &&                                              \
     (!SIMDUTF_IS_32BITS))
  #define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 1
#else
  #define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE 0
#endif

#if SIMDUTF_IMPLEMENTATION_ICELAKE
  #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
    #define SIMDUTF_TARGET_ICELAKE
  #else
    #define SIMDUTF_TARGET_ICELAKE                                             \
      SIMDUTF_TARGET_REGION(                                                   \
          "avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,"         \
          "avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512vpopcntdq")
  #endif

namespace simdutf {
namespace icelake {} // namespace icelake
} // namespace simdutf

  //
  // These two need to be included outside SIMDUTF_TARGET_REGION
  //
/* begin file src/simdutf/icelake/intrinsics.h */
#ifndef SIMDUTF_ICELAKE_INTRINSICS_H
#define SIMDUTF_ICELAKE_INTRINSICS_H


#ifdef SIMDUTF_VISUAL_STUDIO
  // under clang within visual studio, this will include <x86intrin.h>
  #include <intrin.h> // visual studio or clang
  #include <immintrin.h>
#else

  #if SIMDUTF_GCC11ORMORE
// We should not get warnings while including <x86intrin.h> yet we do
// under some versions of GCC.
// If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
  #endif

  #include <x86intrin.h> // elsewhere

  #if SIMDUTF_GCC11ORMORE
// cancels the suppression of the -Wuninitialized
SIMDUTF_POP_DISABLE_WARNINGS
  #endif

  #ifndef _tzcnt_u64
    #define _tzcnt_u64(x) __tzcnt_u64(x)
  #endif // _tzcnt_u64
#endif   // SIMDUTF_VISUAL_STUDIO

#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
  /**
   * You are not supposed, normally, to include these
   * headers directly. Instead you should either include intrin.h
   * or x86intrin.h. However, when compiling with clang
   * under Windows (i.e., when _MSC_VER is set), these headers
   * only get included *if* the corresponding features are detected
   * from macros:
   * e.g., if __AVX2__ is set... in turn,  we normally set these
   * macros by compiling against the corresponding architecture
   * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
   * software with these advanced instructions. In simdutf, we
   * want to compile the whole program for a generic target,
   * and only target our specific kernels. As a workaround,
   * we directly include the needed headers. These headers would
   * normally guard against such usage, but we carefully included
   * <x86intrin.h>  (or <intrin.h>) before, so the headers
   * are fooled.
   */
  #include <bmiintrin.h>   // for _blsr_u64
  #include <bmi2intrin.h>  // for _pext_u64, _pdep_u64
  #include <lzcntintrin.h> // for  __lzcnt64
  #include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
  #include <smmintrin.h>
  #include <tmmintrin.h>
  #include <avxintrin.h>
  #include <avx2intrin.h>
  // Important: we need the AVX-512 headers:
  #include <avx512fintrin.h>
  #include <avx512dqintrin.h>
  #include <avx512cdintrin.h>
  #include <avx512bwintrin.h>
  #include <avx512vlintrin.h>
  #include <avx512vlbwintrin.h>
  #include <avx512vbmiintrin.h>
  #include <avx512vbmi2intrin.h>
  #include <avx512vpopcntdqintrin.h>
  #include <avx512vpopcntdqvlintrin.h>
  // unfortunately, we may not get _blsr_u64, but, thankfully, clang
  // has it as a macro.
  #ifndef _blsr_u64
    // we roll our own
    #define _blsr_u64(n) ((n - 1) & n)
  #endif //  _blsr_u64
#endif   // SIMDUTF_CLANG_VISUAL_STUDIO

#if defined(__GNUC__) && !defined(__clang__)

  #if __GNUC__ == 8
    #define SIMDUTF_GCC8 1
  #elif __GNUC__ == 9
    #define SIMDUTF_GCC9 1
  #endif //  __GNUC__ == 8 || __GNUC__ == 9

#endif // defined(__GNUC__) && !defined(__clang__)

#if SIMDUTF_GCC8
  #pragma GCC push_options
  #pragma GCC target("avx512f")
/**
 * GCC 8 fails to provide _mm512_set_epi8. We roll our own.
 */
inline __m512i
_mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4,
                uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9,
                uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14,
                uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19,
                uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24,
                uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29,
                uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34,
                uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39,
                uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44,
                uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49,
                uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54,
                uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59,
                uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
  return _mm512_set_epi64(
      uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) +
          (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) +
          (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
      uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) +
          (uint64_t(a12) << 24) + (uint64_t(a11) << 32) +
          (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
      uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) +
          (uint64_t(a20) << 24) + (uint64_t(a19) << 32) +
          (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
      uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) +
          (uint64_t(a28) << 24) + (uint64_t(a27) << 32) +
          (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
      uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) +
          (uint64_t(a36) << 24) + (uint64_t(a35) << 32) +
          (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
      uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) +
          (uint64_t(a44) << 24) + (uint64_t(a43) << 32) +
          (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
      uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) +
          (uint64_t(a52) << 24) + (uint64_t(a51) << 32) +
          (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
      uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) +
          (uint64_t(a60) << 24) + (uint64_t(a59) << 32) +
          (uint64_t(a58) << 40) + (uint64_t(a57) << 48) +
          (uint64_t(a56) << 56));
}
  #pragma GCC pop_options
#endif // SIMDUTF_GCC8

#endif // SIMDUTF_HASWELL_INTRINSICS_H
/* end file src/simdutf/icelake/intrinsics.h */
/* begin file src/simdutf/icelake/implementation.h */
#ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
#define SIMDUTF_ICELAKE_IMPLEMENTATION_H


namespace simdutf {
namespace icelake {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation(
            "icelake",
            "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 "
            "extensions)",
            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 |
                internal::instruction_set::BMI2 |
                internal::instruction_set::AVX512BW |
                internal::instruction_set::AVX512CD |
                internal::instruction_set::AVX512VL |
                internal::instruction_set::AVX512VBMI2 |
                internal::instruction_set::AVX512VPOPCNTDQ) {}

  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;

  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;

  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;

  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;

  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;
  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char16_t *input, size_t length) const noexcept override;
};

} // namespace icelake
} // namespace simdutf

#endif // SIMDUTF_ICELAKE_IMPLEMENTATION_H
/* end file src/simdutf/icelake/implementation.h */

  //
  // The rest need to be inside the region
  //
/* begin file src/simdutf/icelake/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "icelake"
// #define SIMDUTF_IMPLEMENTATION icelake

#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
// nothing needed.
#else
SIMDUTF_TARGET_ICELAKE
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
// clang-format off
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
// clang-format on
#endif // end of workaround
/* end file src/simdutf/icelake/begin.h */
  // Declarations
/* begin file src/simdutf/icelake/bitmanipulation.h */
#ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
#define SIMDUTF_ICELAKE_BITMANIPULATION_H

namespace simdutf {
namespace icelake {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline long long int count_ones(uint64_t input_num) {
  return _popcnt64(input_num);
}
#endif

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline unsigned __int64 count_ones32(uint32_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline long long int count_ones32(uint32_t input_num) {
  return _popcnt32(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
// simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
//   #if SIMDUTF_REGULAR_VISUAL_STUDIO
//   return (int)_tzcnt_u64(input_num);
//   #else  // SIMDUTF_REGULAR_VISUAL_STUDIO
//   return __builtin_ctzll(input_num);
//   #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
// }
#endif

} // unnamed namespace
} // namespace icelake
} // namespace simdutf

#endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
/* end file src/simdutf/icelake/bitmanipulation.h */
/* begin file src/simdutf/icelake/simd.h */
#ifndef SIMDUTF_ICELAKE_SIMD_H
#define SIMDUTF_ICELAKE_SIMD_H

namespace simdutf {
namespace icelake {
namespace {
namespace simd {

/* begin file src/simdutf/icelake/simd16-inl.h */
template <typename T> struct simd16;

template <> struct simd16<uint16_t> {
  static const size_t SIZE = sizeof(__m512i);
  static const size_t ELEMENTS = SIZE / sizeof(uint16_t);

  template <typename Pointer>
  static simdutf_really_inline simd16<uint16_t> load(const Pointer *ptr) {
    return simd16<uint16_t>(ptr);
  }

  __m512i value;

  simdutf_really_inline simd16(const __m512i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd16(const Pointer *ptr)
      : value(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr))) {}

  // operators
  simdutf_really_inline simd16 &operator+=(const simd16 other) {
    value = _mm512_add_epi32(value, other.value);
    return *this;
  }

  simdutf_really_inline simd16 &operator-=(const simd16 other) {
    value = _mm512_sub_epi32(value, other.value);
    return *this;
  }

  // methods
  simdutf_really_inline simd16 swap_bytes() const {
    const __m512i byteflip = _mm512_setr_epi64(
        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
        0x0607040502030001, 0x0e0f0c0d0a0b0809);

    return _mm512_shuffle_epi8(value, byteflip);
  }

  simdutf_really_inline uint64_t sum() const {
    const auto lo = _mm512_and_si512(value, _mm512_set1_epi32(0xffff));
    const auto hi = _mm512_srli_epi32(value, 16);
    const auto sum32 = _mm512_add_epi32(lo, hi);

    return _mm512_reduce_add_epi32(sum32);
  }

  // static members
  simdutf_really_inline static simd16<uint16_t> zero() {
    return _mm512_setzero_si512();
  }

  simdutf_really_inline static simd16<uint16_t> splat(uint16_t v) {
    return _mm512_set1_epi16(v);
  }
};

template <> struct simd16<bool> {
  __mmask32 value;

  simdutf_really_inline simd16(const __mmask32 v) : value(v) {}
};

// ------------------------------------------------------------

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> b,
                                           const simd16<uint16_t> a) {
  return _mm512_min_epu16(a.value, b.value);
}

simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> a,
                                                 uint16_t b) {
  return _mm512_and_si512(a.value, _mm512_set1_epi16(b));
}

simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
                                                 uint16_t b) {
  return _mm512_xor_si512(a.value, _mm512_set1_epi16(b));
}

simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
                                                 const simd16<uint16_t> b) {
  return _mm512_xor_si512(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator==(const simd16<uint16_t> a,
                                              uint16_t b) {
  return _mm512_cmpeq_epi16_mask(a.value, _mm512_set1_epi16(b));
}
/* end file src/simdutf/icelake/simd16-inl.h */
/* begin file src/simdutf/icelake/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  static const size_t SIZE = sizeof(__m512i);
  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m512i value;

  simdutf_really_inline simd32(const __m512i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd32(const Pointer *ptr)
      : value(_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr))) {}

  uint64_t sum() const {
    const __m512i mask = _mm512_set1_epi64(0xffffffff);
    const __m512i t0 = _mm512_and_si512(value, mask);
    const __m512i t1 = _mm512_srli_epi64(value, 32);
    const __m512i t2 = _mm512_add_epi64(t0, t1);
    return _mm512_reduce_add_epi64(t2);
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = _mm512_add_epi32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return _mm512_setzero_si512();
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return _mm512_set1_epi32(v);
  }
};

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> b,
                                           const simd32<uint32_t> a) {
  return _mm512_min_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> b,
                                                 const simd32<uint32_t> a) {
  return _mm512_and_si512(a.value, b.value);
}
/* end file src/simdutf/icelake/simd32-inl.h */

} // namespace simd
} // unnamed namespace
} // namespace icelake
} // namespace simdutf

#endif // SIMDUTF_ICELAKE_SIMD_H
/* end file src/simdutf/icelake/simd.h */

/* begin file src/simdutf/icelake/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif


#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/icelake/end.h */

#endif // SIMDUTF_IMPLEMENTATION_ICELAKE
#endif // SIMDUTF_ICELAKE_H
/* end file src/simdutf/icelake.h */
/* begin file src/simdutf/haswell.h */
#ifndef SIMDUTF_HASWELL_H
#define SIMDUTF_HASWELL_H

#ifdef SIMDUTF_WESTMERE_H
  #error "haswell.h must be included before westmere.h"
#endif
#ifdef SIMDUTF_FALLBACK_H
  #error "haswell.h must be included before fallback.h"
#endif


// Default Haswell to on if this is x86-64. Even if we are not compiled for it,
// it could be selected at runtime.
#ifndef SIMDUTF_IMPLEMENTATION_HASWELL
  //
  // You do not want to restrict it like so: SIMDUTF_IS_X86_64 && __AVX2__
  // because we want to rely on *runtime dispatch*.
  //
  #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
    #define SIMDUTF_IMPLEMENTATION_HASWELL 0
  #else
    #define SIMDUTF_IMPLEMENTATION_HASWELL (SIMDUTF_IS_X86_64)
  #endif

#endif
// To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
// https://github.com/simdutf/simdutf/issues/1247
#if ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
  #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 1
#else
  #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL 0
#endif

#if SIMDUTF_IMPLEMENTATION_HASWELL

  #define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")

namespace simdutf {
/**
 * Implementation for Haswell (Intel AVX2).
 */
namespace haswell {} // namespace haswell
} // namespace simdutf

  //
  // These two need to be included outside SIMDUTF_TARGET_REGION
  //
/* begin file src/simdutf/haswell/implementation.h */
#ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
#define SIMDUTF_HASWELL_IMPLEMENTATION_H


// The constructor may be executed on any host, so we take care not to use
// SIMDUTF_TARGET_REGION
namespace simdutf {
namespace haswell {

using namespace simdutf;

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("haswell", "Intel/AMD AVX2",
                                internal::instruction_set::AVX2 |
                                    internal::instruction_set::BMI1 |
                                    internal::instruction_set::BMI2) {}

  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;

  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;

  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;

  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;

  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;
  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char16_t *input, size_t length) const noexcept override;
};

} // namespace haswell
} // namespace simdutf

#endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
/* end file src/simdutf/haswell/implementation.h */
/* begin file src/simdutf/haswell/intrinsics.h */
#ifndef SIMDUTF_HASWELL_INTRINSICS_H
#define SIMDUTF_HASWELL_INTRINSICS_H


#ifdef SIMDUTF_VISUAL_STUDIO
  // under clang within visual studio, this will include <x86intrin.h>
  #include <intrin.h> // visual studio or clang
#else

  #if SIMDUTF_GCC11ORMORE
// We should not get warnings while including <x86intrin.h> yet we do
// under some versions of GCC.
// If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
  #endif

  #include <x86intrin.h> // elsewhere

  #if SIMDUTF_GCC11ORMORE
// cancels the suppression of the -Wuninitialized
SIMDUTF_POP_DISABLE_WARNINGS
  #endif

#endif // SIMDUTF_VISUAL_STUDIO

#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
  /**
   * You are not supposed, normally, to include these
   * headers directly. Instead you should either include intrin.h
   * or x86intrin.h. However, when compiling with clang
   * under Windows (i.e., when _MSC_VER is set), these headers
   * only get included *if* the corresponding features are detected
   * from macros:
   * e.g., if __AVX2__ is set... in turn,  we normally set these
   * macros by compiling against the corresponding architecture
   * (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
   * software with these advanced instructions. In simdutf, we
   * want to compile the whole program for a generic target,
   * and only target our specific kernels. As a workaround,
   * we directly include the needed headers. These headers would
   * normally guard against such usage, but we carefully included
   * <x86intrin.h>  (or <intrin.h>) before, so the headers
   * are fooled.
   */
  #include <bmiintrin.h>   // for _blsr_u64
  #include <lzcntintrin.h> // for  __lzcnt64
  #include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
  #include <smmintrin.h>
  #include <tmmintrin.h>
  #include <avxintrin.h>
  #include <avx2intrin.h>
  // unfortunately, we may not get _blsr_u64, but, thankfully, clang
  // has it as a macro.
  #ifndef _blsr_u64
    // we roll our own
    #define _blsr_u64(n) (((n) - 1) & (n))
  #endif //  _blsr_u64
  // Same issue with _blsmsk_u32:
  #ifndef _blsmsk_u32
    // we roll our own
    #define _blsmsk_u32(n) (((n) - 1) ^ (n))
  #endif //  _blsmsk_u32
#endif   // SIMDUTF_CLANG_VISUAL_STUDIO

#endif // SIMDUTF_HASWELL_INTRINSICS_H
/* end file src/simdutf/haswell/intrinsics.h */

  //
  // The rest need to be inside the region
  //
/* begin file src/simdutf/haswell/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "haswell"
// #define SIMDUTF_IMPLEMENTATION haswell
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_TARGET_HASWELL
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
// clang-format off
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
// clang-format on
#endif // end of workaround
/* end file src/simdutf/haswell/begin.h */
  // Declarations
/* begin file src/simdutf/haswell/bitmanipulation.h */
#ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
#define SIMDUTF_HASWELL_BITMANIPULATION_H

namespace simdutf {
namespace haswell {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline long long int count_ones(uint64_t input_num) {
  return _popcnt64(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
  #if SIMDUTF_REGULAR_VISUAL_STUDIO
  return (int)_tzcnt_u64(input_num);
  #else  // SIMDUTF_REGULAR_VISUAL_STUDIO
  return __builtin_ctzll(input_num);
  #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
}
#endif

template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }

} // unnamed namespace
} // namespace haswell
} // namespace simdutf

#endif // SIMDUTF_HASWELL_BITMANIPULATION_H
/* end file src/simdutf/haswell/bitmanipulation.h */
/* begin file src/simdutf/haswell/simd.h */
#ifndef SIMDUTF_HASWELL_SIMD_H
#define SIMDUTF_HASWELL_SIMD_H

namespace simdutf {
namespace haswell {
namespace {
namespace simd {

// Forward-declared so they can be used by splat and friends.
template <typename Child> struct base {
  __m256i value;

  // Zero constructor
  simdutf_really_inline base() : value{__m256i()} {}

  // Conversion from SIMD register
  simdutf_really_inline base(const __m256i _value) : value(_value) {}

  simdutf_really_inline operator const __m256i &() const { return this->value; }

  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
    __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this, 1));
    if (big_endian) {
      const __m256i swap = _mm256_setr_epi8(
          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
      first = _mm256_shuffle_epi8(first, swap);
      second = _mm256_shuffle_epi8(second, swap);
    }
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                        _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 8),
                        _mm256_cvtepu8_epi32(_mm256_castsi256_si128(
                            _mm256_srli_si256(*this, 8))));
    _mm256_storeu_si256(
        reinterpret_cast<__m256i *>(ptr + 16),
        _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this, 1)));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24),
                        _mm256_cvtepu8_epi32(_mm_srli_si128(
                            _mm256_extractf128_si256(*this, 1), 8)));
  }
  // Bit operations
  simdutf_really_inline Child operator|(const Child other) const {
    return _mm256_or_si256(*this, other);
  }
  simdutf_really_inline Child operator&(const Child other) const {
    return _mm256_and_si256(*this, other);
  }
  simdutf_really_inline Child operator^(const Child other) const {
    return _mm256_xor_si256(*this, other);
  }
  simdutf_really_inline Child &operator|=(const Child other) {
    auto this_cast = static_cast<Child *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }
};

// Forward-declared so they can be used by splat and friends.
template <typename T> struct simd8;

template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  simdutf_really_inline base8() : base<simd8<T>>() {}

  simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}

  friend simdutf_always_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return _mm256_cmpeq_epi8(lhs, rhs);
  }

  static const int SIZE = sizeof(base<T>::value);

  template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return _mm256_alignr_epi8(
        *this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return _mm256_set1_epi8(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8(const __m256i _value) : base8<bool>(_value) {}

  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  simdutf_really_inline uint32_t to_bitmask() const {
    return uint32_t(_mm256_movemask_epi8(value));
  }
};

template <typename T> struct base8_numeric : base8<T> {
  static simdutf_really_inline simd8<T> splat(T _value) {
    return _mm256_set1_epi8(_value);
  }
  static simdutf_really_inline simd8<T> zero() {
    return _mm256_setzero_si256();
  }
  static simdutf_really_inline simd8<T> load(const T values[32]) {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                    v12, v13, v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const __m256i _value)
      : base8<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[32]) const {
    return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this);
  }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
    return _mm256_sub_epi8(*this, other);
  }
  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
    *this = *this - other;
    return *static_cast<simd8<T> *>(this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return _mm256_shuffle_epi8(lookup_table, *this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};

// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<int8_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
  simdutf_really_inline operator simd8<uint8_t>() const;

  simdutf_really_inline bool is_ascii() const {
    return _mm256_movemask_epi8(*this) == 0;
  }
  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return _mm256_cmpgt_epi8(*this, other);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return _mm256_cmpgt_epi8(other, *this);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20,
        uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25,
        uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30,
        uint8_t v31)
      : simd8(_mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                               v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
                               v22, v23, v24, v25, v26, v27, v28, v29, v30,
                               v31)) {}

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return _mm256_subs_epu8(*this, other);
  }

  // Order-specific operations
  simdutf_really_inline simd8<uint8_t>
  min_val(const simd8<uint8_t> other) const {
    return _mm256_min_epu8(other, *this);
  }
  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return other.min_val(*this) == other;
  }

  // Bit-specific operations
  simdutf_really_inline bool is_ascii() const {
    return _mm256_movemask_epi8(*this) == 0;
  }
  simdutf_really_inline bool bits_not_set_anywhere() const {
    return _mm256_testz_si256(*this, *this);
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return !bits_not_set_anywhere();
  }

  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N);
  }

  simdutf_really_inline uint64_t sum_bytes() const {
    const auto tmp = _mm256_sad_epu8(value, _mm256_setzero_si256());

    return _mm256_extract_epi64(tmp, 0) + _mm256_extract_epi64(tmp, 1) +
           _mm256_extract_epi64(tmp, 2) + _mm256_extract_epi64(tmp, 3);
  }
};
simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return this->value;
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 2,
                "Haswell kernel should use two registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r_hi = this->chunks[1].to_bitmask();
    return r_lo | (r_hi << 32);
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return this->chunks[0] | this->chunks[1];
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
    const simd8<T> mask_low = simd8<T>::splat(low);
    const simd8<T> mask_high = simd8<T>::splat(high);

    return simd8x64<bool>(
               (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
               (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
        .to_bitmask();
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>((simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
                          (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
        .to_bitmask();
  }
}; // struct simd8x64<T>

/* begin file src/simdutf/haswell/simd16-inl.h */
#ifdef __GNUC__
  #if __GNUC__ < 8
    #define _mm256_set_m128i(xmm1, xmm2)                                       \
      _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1),                  \
                                _mm256_castsi128_si256(xmm2), 2)
    #define _mm256_setr_m128i(xmm2, xmm1)                                      \
      _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1),                  \
                                _mm256_castsi128_si256(xmm2), 2)
  #endif
#endif

template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>>
struct base16 : base<simd16<T>> {
  using bitmask_type = uint32_t;

  simdutf_really_inline base16() : base<simd16<T>>() {}
  simdutf_really_inline base16(const __m256i _value)
      : base<simd16<T>>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr)
      : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}

  friend simdutf_always_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return _mm256_cmpeq_epi16(lhs, rhs);
  }

  /// the size of vector in bytes
  static const int SIZE = sizeof(base<simd16<T>>::value);

  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return _mm256_set1_epi16(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}

  simdutf_really_inline simd16(const __m256i _value) : base16<bool>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline bitmask_type to_bitmask() const {
    return _mm256_movemask_epi8(*this);
  }

  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return _mm256_set1_epi16(_value);
  }

  static simdutf_really_inline simd16<T> zero() {
    return _mm256_setzero_si256();
  }

  static simdutf_really_inline simd16<T> load(const T values[8]) {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}

  simdutf_really_inline base16_numeric(const __m256i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
    return _mm256_add_epi16(*this, other);
  }
  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
    *this = *this + other;
    return *static_cast<simd16<T> *>(this);
  }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const __m256i _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return _mm256_max_epu16(*this, other);
  }
  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return _mm256_min_epu16(*this, other);
  }
  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return other.max_val(*this) == other;
  }
  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return other.min_val(*this) == other;
  }

  // Bit-specific operations
  simdutf_really_inline simd16<bool> bits_not_set() const {
    return *this == uint16_t(0);
  }

  simdutf_really_inline simd16<bool> any_bits_set() const {
    return ~this->bits_not_set();
  }

  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return simd16<uint16_t>(_mm256_srli_epi16(*this, N));
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    const __m256i swap = _mm256_setr_epi8(
        1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
        21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
    return _mm256_shuffle_epi8(*this, swap);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
    //       we have to shuffle lanes in order to produce bytes in the
    //       correct order.

    // get the 0th lanes
    const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
    const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);

    // get the 1st lanes
    const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
    const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);

    // build new vectors (shuffle lanes)
    const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
    const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);

    // pack code units in linear order from v0 and v1
    return _mm256_packus_epi16(t0, t1);
  }

  simdutf_really_inline uint64_t sum() const {
    const auto lo_u16 = _mm256_and_si256(value, _mm256_set1_epi32(0x0000ffff));
    const auto hi_u16 = _mm256_srli_epi32(value, 16);
    const auto sum_u32 = _mm256_add_epi32(lo_u16, hi_u16);

    const auto lo_u32 =
        _mm256_and_si256(sum_u32, _mm256_set1_epi64x(0xffffffff));
    const auto hi_u32 = _mm256_srli_epi64(sum_u32, 32);
    const auto sum_u64 = _mm256_add_epi64(lo_u32, hi_u32);

    return uint64_t(_mm256_extract_epi64(sum_u64, 0)) +
           uint64_t(_mm256_extract_epi64(sum_u64, 1)) +
           uint64_t(_mm256_extract_epi64(sum_u64, 2)) +
           uint64_t(_mm256_extract_epi64(sum_u64, 3));
  }
};

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 2,
                "Haswell kernel should use two registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline simd16x32(const simd16<T> chunk0,
                                  const simd16<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r_hi = this->chunks[1].to_bitmask();
    return r_lo | (r_hi << 32);
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return this->chunks[0] | this->chunks[1];
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>));
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
  }
  simdutf_really_inline uint64_t gt(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] > mask, this->chunks[1] > mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t eq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
    const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
    return simd16x32<bool>(
               (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
               (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>

simd16<uint16_t> min(const simd16<uint16_t> a, simd16<uint16_t> b) {
  return _mm256_min_epu16(a.value, b.value);
}
/* end file src/simdutf/haswell/simd16-inl.h */
/* begin file src/simdutf/haswell/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  static const size_t SIZE = sizeof(__m256i);
  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m256i value;

  simdutf_really_inline simd32(const __m256i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd32(const Pointer *ptr)
      : value(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const {
    const __m256i mask = _mm256_set1_epi64x(0xffffffff);
    const __m256i t0 = _mm256_and_si256(value, mask);
    const __m256i t1 = _mm256_srli_epi64(value, 32);
    const __m256i t2 = _mm256_add_epi64(t0, t1);

    return uint64_t(_mm256_extract_epi64(t2, 0)) +
           uint64_t(_mm256_extract_epi64(t2, 1)) +
           uint64_t(_mm256_extract_epi64(t2, 2)) +
           uint64_t(_mm256_extract_epi64(t2, 3));
  }

  simdutf_really_inline simd32<uint32_t> swap_bytes() const {
    const __m256i shuffle =
        _mm256_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12,
                         3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12);

    return _mm256_shuffle_epi8(value, shuffle);
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = _mm256_add_epi32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return _mm256_setzero_si256();
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return _mm256_set1_epi32(v);
  }
};

//----------------------------------------------------------------------

template <> struct simd32<bool> {
  // static const size_t SIZE = sizeof(__m128i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m256i value;

  simdutf_really_inline simd32(const __m256i v) : value(v) {}

  simdutf_really_inline bool any() const {
    return _mm256_movemask_epi8(value) != 0;
  }
};

//----------------------------------------------------------------------

template <typename T>
simdutf_really_inline simd32<T> operator|(const simd32<T> a,
                                          const simd32<T> b) {
  return _mm256_or_si256(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> b,
                                           const simd32<uint32_t> a) {
  return _mm256_min_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> max(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return _mm256_max_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> b,
                                                 const simd32<uint32_t> a) {
  return _mm256_and_si256(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator+(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return _mm256_add_epi32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm256_cmpeq_epi32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator>=(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm256_cmpeq_epi32(_mm256_max_epu32(a.value, b.value), a.value);
}

simdutf_really_inline simd32<bool> operator!(const simd32<bool> v) {
  return _mm256_xor_si256(v.value, _mm256_set1_epi8(-1));
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return !(b >= a);
}
/* end file src/simdutf/haswell/simd32-inl.h */
/* begin file src/simdutf/haswell/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  // static const size_t SIZE = sizeof(__m256i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint64_t);

  __m256i value;

  simdutf_really_inline simd64(const __m256i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd64(const Pointer *ptr)
      : value(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const {
    return _mm256_extract_epi64(value, 0) + _mm256_extract_epi64(value, 1) +
           _mm256_extract_epi64(value, 2) + _mm256_extract_epi64(value, 3);
  }

  // operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = _mm256_add_epi64(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd64<uint64_t> zero() {
    return _mm256_setzero_si256();
  }

  simdutf_really_inline static simd64<uint64_t> splat(uint64_t v) {
    return _mm256_set1_epi64x(v);
  }
};
/* end file src/simdutf/haswell/simd64-inl.h */

simdutf_really_inline simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  return _mm256_sad_epu8(v.value, simd8<uint8_t>::zero());
}

} // namespace simd

} // unnamed namespace
} // namespace haswell
} // namespace simdutf

#endif // SIMDUTF_HASWELL_SIMD_H
/* end file src/simdutf/haswell/simd.h */

/* begin file src/simdutf/haswell/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#undef SIMDUTF_SIMD_HAS_BYTEMASK

#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/haswell/end.h */

#endif // SIMDUTF_IMPLEMENTATION_HASWELL
#endif // SIMDUTF_HASWELL_COMMON_H
/* end file src/simdutf/haswell.h */
/* begin file src/simdutf/westmere.h */
#ifndef SIMDUTF_WESTMERE_H
#define SIMDUTF_WESTMERE_H

#ifdef SIMDUTF_FALLBACK_H
  #error "westmere.h must be included before fallback.h"
#endif


// Default Westmere to on if this is x86-64, unless we'll always select Haswell.
#ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
  //
  // You do not want to set it to (SIMDUTF_IS_X86_64 &&
  // !SIMDUTF_REQUIRES_HASWELL) because you want to rely on runtime dispatch!
  //
  #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL
    #define SIMDUTF_IMPLEMENTATION_WESTMERE 0
  #else
    #define SIMDUTF_IMPLEMENTATION_WESTMERE (SIMDUTF_IS_X86_64)
  #endif

#endif

#if (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
  #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 1
#else
  #define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE 0
#endif

#if SIMDUTF_IMPLEMENTATION_WESTMERE

  #define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")

namespace simdutf {
/**
 * Implementation for Westmere (Intel SSE4.2).
 */
namespace westmere {} // namespace westmere
} // namespace simdutf

  //
  // These two need to be included outside SIMDUTF_TARGET_REGION
  //
/* begin file src/simdutf/westmere/implementation.h */
#ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
#define SIMDUTF_WESTMERE_IMPLEMENTATION_H


// The constructor may be executed on any host, so we take care not to use
// SIMDUTF_TARGET_REGION
namespace simdutf {
namespace westmere {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("westmere", "Intel/AMD SSE4.2",
                                internal::instruction_set::SSE42) {}

  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;

  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;

  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;

  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;

  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;
  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char16_t *input, size_t length) const noexcept override;
};

} // namespace westmere
} // namespace simdutf

#endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
/* end file src/simdutf/westmere/implementation.h */
/* begin file src/simdutf/westmere/intrinsics.h */
#ifndef SIMDUTF_WESTMERE_INTRINSICS_H
#define SIMDUTF_WESTMERE_INTRINSICS_H

#ifdef SIMDUTF_VISUAL_STUDIO
  // under clang within visual studio, this will include <x86intrin.h>
  #include <intrin.h> // visual studio or clang
#else

  #if SIMDUTF_GCC11ORMORE
// We should not get warnings while including <x86intrin.h> yet we do
// under some versions of GCC.
// If the x86intrin.h header has uninitialized values that are problematic,
// it is a GCC issue, we want to ignore these warnings.
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
  #endif

  #include <x86intrin.h> // elsewhere

  #if SIMDUTF_GCC11ORMORE
// cancels the suppression of the -Wuninitialized
SIMDUTF_POP_DISABLE_WARNINGS
  #endif

#endif // SIMDUTF_VISUAL_STUDIO

#ifdef SIMDUTF_CLANG_VISUAL_STUDIO
  /**
   * You are not supposed, normally, to include these
   * headers directly. Instead you should either include intrin.h
   * or x86intrin.h. However, when compiling with clang
   * under Windows (i.e., when _MSC_VER is set), these headers
   * only get included *if* the corresponding features are detected
   * from macros:
   */
  #include <smmintrin.h> // for _mm_alignr_epi8
#endif

#endif // SIMDUTF_WESTMERE_INTRINSICS_H
/* end file src/simdutf/westmere/intrinsics.h */

  //
  // The rest need to be inside the region
  //
/* begin file src/simdutf/westmere/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "westmere"
// #define SIMDUTF_IMPLEMENTATION westmere
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
// nothing needed.
#else
SIMDUTF_TARGET_WESTMERE
#endif
/* end file src/simdutf/westmere/begin.h */

  // Declarations
/* begin file src/simdutf/westmere/bitmanipulation.h */
#ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
#define SIMDUTF_WESTMERE_BITMANIPULATION_H

namespace simdutf {
namespace westmere {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline long long int count_ones(uint64_t input_num) {
  return _popcnt64(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
  #if SIMDUTF_REGULAR_VISUAL_STUDIO
  unsigned long ret;
  _BitScanForward64(&ret, input_num);
  return (int)ret;
  #else  // SIMDUTF_REGULAR_VISUAL_STUDIO
  return __builtin_ctzll(input_num);
  #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
}
#endif

template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }

} // unnamed namespace
} // namespace westmere
} // namespace simdutf

#endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
/* end file src/simdutf/westmere/bitmanipulation.h */
/* begin file src/simdutf/westmere/simd.h */
#ifndef SIMDUTF_WESTMERE_SIMD_H
#define SIMDUTF_WESTMERE_SIMD_H

namespace simdutf {
namespace westmere {
namespace {
namespace simd {

template <typename Child> struct base {
  __m128i value;

  // Zero constructor
  simdutf_really_inline base() : value{__m128i()} {}

  // Conversion from SIMD register
  simdutf_really_inline base(const __m128i _value) : value(_value) {}
  // Conversion to SIMD register
  simdutf_really_inline operator const __m128i &() const { return this->value; }
  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
    __m128i first = _mm_cvtepu8_epi16(*this);
    __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this, 8));
    if (big_endian) {
      const __m128i swap =
          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
      first = _mm_shuffle_epi8(first, swap);
      second = _mm_shuffle_epi8(second, swap);
    }
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8), second);
  }
  simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 4),
                     _mm_cvtepu8_epi32(_mm_srli_si128(*this, 4)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 8),
                     _mm_cvtepu8_epi32(_mm_srli_si128(*this, 8)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(p + 12),
                     _mm_cvtepu8_epi32(_mm_srli_si128(*this, 12)));
  }
  // Bit operations
  simdutf_really_inline Child operator|(const Child other) const {
    return _mm_or_si128(*this, other);
  }
  simdutf_really_inline Child operator&(const Child other) const {
    return _mm_and_si128(*this, other);
  }
  simdutf_really_inline Child operator^(const Child other) const {
    return _mm_xor_si128(*this, other);
  }
  simdutf_really_inline Child &operator|=(const Child other) {
    auto this_cast = static_cast<Child *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }
};

// Forward-declared so they can be used by splat and friends.
template <typename T> struct simd8;

template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  simdutf_really_inline T first() const { return _mm_extract_epi8(*this, 0); }
  simdutf_really_inline T last() const { return _mm_extract_epi8(*this, 15); }
  simdutf_really_inline base8() : base<simd8<T>>() {}
  simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}

  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return _mm_cmpeq_epi8(lhs, rhs);
  }

  static const int SIZE = sizeof(base<simd8<T>>::value);

  template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return _mm_set1_epi8(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8() : base8() {}
  simdutf_really_inline simd8(const __m128i _value) : base8<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  simdutf_really_inline int to_bitmask() const {
    return _mm_movemask_epi8(*this);
  }
  simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base8_numeric : base8<T> {
  static simdutf_really_inline simd8<T> splat(T _value) {
    return _mm_set1_epi8(_value);
  }
  static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
  static simdutf_really_inline simd8<T> load(const T values[16]) {
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const __m128i _value)
      : base8<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[16]) const {
    return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
    return _mm_sub_epi8(*this, other);
  }
  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
    *this = *this - other;
    return *static_cast<simd8<T> *>(this);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return _mm_shuffle_epi8(lookup_table, *this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};

// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const __m128i _value)
      : base8_numeric<int8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
  simdutf_really_inline operator simd8<uint8_t>() const;
  simdutf_really_inline bool is_ascii() const {
    return _mm_movemask_epi8(*this) == 0;
  }

  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return _mm_cmpgt_epi8(*this, other);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return _mm_cmpgt_epi8(other, *this);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const __m128i _value)
      : base8_numeric<uint8_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8(_mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                            v12, v13, v14, v15)) {}

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return _mm_subs_epu8(*this, other);
  }

  // Order-specific operations
  simdutf_really_inline simd8<uint8_t>
  min_val(const simd8<uint8_t> other) const {
    return _mm_min_epu8(*this, other);
  }
  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }
  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return other.min_val(*this) == other;
  }

  // Bit-specific operations
  simdutf_really_inline simd8<bool> bits_not_set() const {
    return *this == uint8_t(0);
  }
  simdutf_really_inline simd8<bool> any_bits_set() const {
    return ~this->bits_not_set();
  }
  simdutf_really_inline bool is_ascii() const {
    return _mm_movemask_epi8(*this) == 0;
  }

  simdutf_really_inline bool bits_not_set_anywhere() const {
    return _mm_testz_si128(*this, *this);
  }
  simdutf_really_inline bool any_bits_set_anywhere() const {
    return !bits_not_set_anywhere();
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N);
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N);
  }

  simdutf_really_inline uint64_t sum_bytes() const {
    const auto tmp = _mm_sad_epu8(value, _mm_setzero_si128());
    return _mm_extract_epi64(tmp, 0) + _mm_extract_epi64(tmp, 1);
  }
};

simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return this->value;
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 4,
                "Westmere kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
                          this->chunks[2] >= mask, this->chunks[3] >= mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
                          this->chunks[2] == mask, this->chunks[3] == mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
                          simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
                          simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
                          simd8<uint8_t>(__m128i(this->chunks[3])) >= mask)
        .to_bitmask();
  }
}; // struct simd8x64<T>

/* begin file src/simdutf/westmere/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>>
struct base16 : base<simd16<T>> {
  simdutf_really_inline base16() : base<simd16<T>>() {}

  simdutf_really_inline base16(const __m128i _value)
      : base<simd16<T>>(_value) {}

  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return _mm_cmpeq_epi16(lhs, rhs);
  }

  /// the size of vector in bytes
  static const int SIZE = sizeof(base<simd16<T>>::value);

  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return _mm_set1_epi16(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline int to_bitmask() const {
    return _mm_movemask_epi8(*this);
  }

  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return _mm_set1_epi16(_value);
  }

  static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }

  static simdutf_really_inline simd16<T> load(const T values[8]) {
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}

  simdutf_really_inline base16_numeric(const __m128i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
    return _mm_add_epi16(*this, other);
  }
  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
    *this = *this + other;
    return *static_cast<simd16<T> *>(this);
  }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}

  simdutf_really_inline simd16(const __m128i _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return _mm_max_epu16(*this, other);
  }

  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return _mm_min_epu16(*this, other);
  }

  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return other.max_val(*this) == other;
  }
  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return other.min_val(*this) == other;
  }
  // Bit-specific operations
  simdutf_really_inline simd16<bool> bits_not_set() const {
    return *this == uint16_t(0);
  }

  simdutf_really_inline simd16<bool> any_bits_set() const {
    return ~this->bits_not_set();
  }

  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return simd16<uint16_t>(_mm_srli_epi16(*this, N));
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    const __m128i swap =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    return _mm_shuffle_epi8(*this, swap);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return _mm_packus_epi16(v0, v1);
  }

  simdutf_really_inline uint64_t sum() const {
    const auto lo_u16 = _mm_and_si128(value, _mm_set1_epi32(0x0000ffff));
    const auto hi_u16 = _mm_srli_epi32(value, 16);
    const auto sum_u32 = _mm_add_epi32(lo_u16, hi_u16);

    const auto lo_u32 = _mm_and_si128(sum_u32, _mm_set1_epi64x(0xffffffff));
    const auto hi_u32 = _mm_srli_epi64(sum_u32, 32);
    const auto sum_u64 = _mm_add_epi64(lo_u32, hi_u32);

    return uint64_t(_mm_extract_epi64(sum_u64, 0)) +
           uint64_t(_mm_extract_epi64(sum_u64, 1));
  }
};

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 4,
                "Westmere kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
    this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
    this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }

  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                           this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
                           this->chunks[2] >= mask, this->chunks[3] >= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t eq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
                           this->chunks[2] == mask, this->chunks[3] == mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
    const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
    return simd16x32<bool>(
               (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
               (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
               (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
               (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>

simd16<uint16_t> min(const simd16<uint16_t> a, simd16<uint16_t> b) {
  return _mm_min_epu16(a.value, b.value);
}
/* end file src/simdutf/westmere/simd16-inl.h */
/* begin file src/simdutf/westmere/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  static const size_t SIZE = sizeof(__m128i);
  static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m128i value;

  simdutf_really_inline simd32(const __m128i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd32(const Pointer *ptr)
      : value(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const {
    return uint64_t(_mm_extract_epi32(value, 0)) +
           uint64_t(_mm_extract_epi32(value, 1)) +
           uint64_t(_mm_extract_epi32(value, 2)) +
           uint64_t(_mm_extract_epi32(value, 3));
  }

  simdutf_really_inline simd32<uint32_t> swap_bytes() const {
    const __m128i shuffle =
        _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12);

    return _mm_shuffle_epi8(value, shuffle);
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shr() const {
    return _mm_srli_epi32(value, N);
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shl() const {
    return _mm_slli_epi32(value, N);
  }
  void dump() const {
#ifdef SIMDUTF_LOGGING
    printf("[%08x, %08x, %08x, %08x]\n", uint32_t(_mm_extract_epi32(value, 0)),
           uint32_t(_mm_extract_epi32(value, 1)),
           uint32_t(_mm_extract_epi32(value, 2)),
           uint32_t(_mm_extract_epi32(value, 3)));
#endif // SIMDUTF_LOGGING
  }

  // operators
  simdutf_really_inline simd32 &operator+=(const simd32 other) {
    value = _mm_add_epi32(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd32<uint32_t> zero() {
    return _mm_setzero_si128();
  }

  simdutf_really_inline static simd32<uint32_t> splat(uint32_t v) {
    return _mm_set1_epi32(v);
  }
};

//----------------------------------------------------------------------

template <> struct simd32<bool> {
  // static const size_t SIZE = sizeof(__m128i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint32_t);

  __m128i value;

  simdutf_really_inline simd32(const __m128i v) : value(v) {}

  simdutf_really_inline bool any() const {
    return _mm_movemask_epi8(value) != 0;
  }

  simdutf_really_inline uint8_t to_4bit_bitmask() const {
    return uint8_t(_mm_movemask_ps(_mm_castsi128_ps(value)));
  }
};

//----------------------------------------------------------------------

template <typename T>
simdutf_really_inline simd32<T> operator|(const simd32<T> a,
                                          const simd32<T> b) {
  return _mm_or_si128(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> min(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return _mm_min_epu32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> max(const simd32<uint32_t> a,
                                           const simd32<uint32_t> b) {
  return _mm_max_epu32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              uint32_t b) {
  return _mm_cmpeq_epi32(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return _mm_and_si128(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return _mm_and_si128(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<uint32_t> operator|(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return _mm_or_si128(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<uint32_t> operator+(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return _mm_add_epi32(a.value, b.value);
}

simdutf_really_inline simd32<uint32_t> operator-(const simd32<uint32_t> a,
                                                 uint32_t b) {
  return _mm_sub_epi32(a.value, _mm_set1_epi32(b));
}

simdutf_really_inline simd32<bool> operator==(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm_cmpeq_epi32(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator>=(const simd32<uint32_t> a,
                                              const simd32<uint32_t> b) {
  return _mm_cmpeq_epi32(_mm_max_epu32(a.value, b.value), a.value);
}

simdutf_really_inline simd32<bool> operator!(const simd32<bool> v) {
  return _mm_xor_si128(v.value, _mm_set1_epi8(-1));
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return !(b >= a);
}

simdutf_really_inline simd32<uint32_t> select(const simd32<bool> cond,
                                              const simd32<uint32_t> v_true,
                                              const simd32<uint32_t> v_false) {
  return _mm_blendv_epi8(v_false.value, v_true.value, cond.value);
}
/* end file src/simdutf/westmere/simd32-inl.h */
/* begin file src/simdutf/westmere/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  // static const size_t SIZE = sizeof(__m128i);
  // static const size_t ELEMENTS = SIZE / sizeof(uint64_t);

  __m128i value;

  simdutf_really_inline simd64(const __m128i v) : value(v) {}

  template <typename Pointer>
  simdutf_really_inline simd64(const Pointer *ptr)
      : value(_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr))) {}

  simdutf_really_inline uint64_t sum() const {
    return _mm_extract_epi64(value, 0) + _mm_extract_epi64(value, 1);
  }

  // operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = _mm_add_epi64(value, other.value);
    return *this;
  }

  // static members
  simdutf_really_inline static simd64<uint64_t> zero() {
    return _mm_setzero_si128();
  }

  simdutf_really_inline static simd64<uint64_t> splat(uint64_t v) {
    return _mm_set1_epi64x(v);
  }
};
/* end file src/simdutf/westmere/simd64-inl.h */

simdutf_really_inline simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  return _mm_sad_epu8(v.value, simd8<uint8_t>::zero());
}

simdutf_really_inline simd8<uint8_t> as_vector_u8(const simd32<uint32_t> v) {
  return simd8<uint8_t>(v.value);
}

} // namespace simd
} // unnamed namespace
} // namespace westmere
} // namespace simdutf

#endif // SIMDUTF_WESTMERE_SIMD_INPUT_H
/* end file src/simdutf/westmere/simd.h */

/* begin file src/simdutf/westmere/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#undef SIMDUTF_SIMD_HAS_BYTEMASK
/* end file src/simdutf/westmere/end.h */

#endif // SIMDUTF_IMPLEMENTATION_WESTMERE
#endif // SIMDUTF_WESTMERE_COMMON_H
/* end file src/simdutf/westmere.h */
/* begin file src/simdutf/ppc64.h */
#ifndef SIMDUTF_PPC64_H
#define SIMDUTF_PPC64_H

#ifdef SIMDUTF_FALLBACK_H
  #error "ppc64.h must be included before fallback.h"
#endif


#ifndef SIMDUTF_IMPLEMENTATION_PPC64
  #define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
#endif
#define SIMDUTF_CAN_ALWAYS_RUN_PPC64                                           \
  SIMDUTF_IMPLEMENTATION_PPC64 &&SIMDUTF_IS_PPC64


#if SIMDUTF_IMPLEMENTATION_PPC64

namespace simdutf {
/**
 * Implementation for ALTIVEC (PPC64).
 */
namespace ppc64 {} // namespace ppc64
} // namespace simdutf

/* begin file src/simdutf/ppc64/implementation.h */
#ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
#define SIMDUTF_PPC64_IMPLEMENTATION_H


namespace simdutf {
namespace ppc64 {

namespace {
using namespace simdutf;

template <size_t N> simdutf_really_inline size_t align_down(size_t size) {
  return N * (size / N);
}
} // namespace

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
                                internal::instruction_set::ALTIVEC) {}

  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;

  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;

  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;

  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;

  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t maximal_binary_length_from_base64(
      const char *input, size_t length) const noexcept;
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;

  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;

  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;

#ifdef SIMDUTF_INTERNAL_TESTS
  virtual std::vector<TestProcedure> internal_tests() const override;
#endif
};

} // namespace ppc64
} // namespace simdutf

#endif // SIMDUTF_PPC64_IMPLEMENTATION_H
/* end file src/simdutf/ppc64/implementation.h */

/* begin file src/simdutf/ppc64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
// #define SIMDUTF_IMPLEMENTATION ppc64
/* end file src/simdutf/ppc64/begin.h */

  // Declarations
/* begin file src/simdutf/ppc64/intrinsics.h */
#ifndef SIMDUTF_PPC64_INTRINSICS_H
#define SIMDUTF_PPC64_INTRINSICS_H


// This should be the correct header whether
// you use visual studio or other compilers.
#include <altivec.h>

// These are defined by altivec.h in GCC toolchain, it is safe to undef them.
#ifdef bool
  #undef bool
#endif

#ifdef vector
  #undef vector
#endif

#endif //  SIMDUTF_PPC64_INTRINSICS_H
/* end file src/simdutf/ppc64/intrinsics.h */
/* begin file src/simdutf/ppc64/bitmanipulation.h */
#ifndef SIMDUTF_PPC64_BITMANIPULATION_H
#define SIMDUTF_PPC64_BITMANIPULATION_H

namespace simdutf {
namespace ppc64 {
namespace {

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
simdutf_really_inline int count_ones(uint64_t input_num) {
  // note: we do not support legacy 32-bit Windows
  return __popcnt64(input_num); // Visual Studio wants two underscores
}
#else
simdutf_really_inline int count_ones(uint64_t input_num) {
  return __builtin_popcountll(input_num);
}
#endif

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
  return __builtin_ctzll(input_num);
}
#endif

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf

#endif // SIMDUTF_PPC64_BITMANIPULATION_H
/* end file src/simdutf/ppc64/bitmanipulation.h */
/* begin file src/simdutf/ppc64/simd.h */
#ifndef SIMDUTF_PPC64_SIMD_H
#define SIMDUTF_PPC64_SIMD_H

#include <type_traits>

namespace simdutf {
namespace ppc64 {
namespace {
namespace simd {

using vec_bool_t = __vector __bool char;
using vec_bool16_t = __vector __bool short;
using vec_bool32_t = __vector __bool int;
using vec_u8_t = __vector unsigned char;
using vec_i8_t = __vector signed char;
using vec_u16_t = __vector unsigned short;
using vec_i16_t = __vector signed short;
using vec_u32_t = __vector unsigned int;
using vec_i32_t = __vector signed int;
using vec_u64_t = __vector unsigned long long;
using vec_i64_t = __vector signed long long;

// clang-format off
template <typename T> struct vector_u8_type_for_element_aux {
  using type = typename std::conditional<std::is_same<T, bool>::value,    vec_bool_t,
               typename std::conditional<std::is_same<T, uint8_t>::value, vec_u8_t,
               typename std::conditional<std::is_same<T, int8_t>::value,  vec_i8_t, void>::type>::type>::type;

  static_assert(not std::is_same<type, void>::value,
                "accepted element types are 8 bit integers or bool");
};

template <typename T> struct vector_u16_type_for_element_aux {
  using type = typename std::conditional<std::is_same<T, bool>::value,     vec_bool16_t,
               typename std::conditional<std::is_same<T, uint16_t>::value, vec_u16_t,
               typename std::conditional<std::is_same<T, int16_t>::value,  vec_i16_t, void>::type>::type>::type;

  static_assert(not std::is_same<type, void>::value,
                "accepted element types are 16 bit integers or bool");
};

template <typename T> struct vector_u32_type_for_element_aux {
  using type = typename std::conditional<std::is_same<T, bool>::value,     vec_bool32_t,
               typename std::conditional<std::is_same<T, uint32_t>::value, vec_u32_t,
               typename std::conditional<std::is_same<T, int32_t>::value,  vec_i32_t, void>::type>::type>::type;

  static_assert(not std::is_same<type, void>::value,
                "accepted element types are 32 bit integers or bool");
};
// clang-format on

template <typename T>
using vector_u8_type_for_element =
    typename vector_u8_type_for_element_aux<T>::type;

template <typename T>
using vector_u16_type_for_element =
    typename vector_u16_type_for_element_aux<T>::type;

template <typename T>
using vector_u32_type_for_element =
    typename vector_u32_type_for_element_aux<T>::type;

template <typename T> uint16_t move_mask_u8(T vec) {
  const vec_u8_t perm_mask = {15 * 8, 14 * 8, 13 * 8, 12 * 8, 11 * 8, 10 * 8,
                              9 * 8,  8 * 8,  7 * 8,  6 * 8,  5 * 8,  4 * 8,
                              3 * 8,  2 * 8,  1 * 8,  0 * 8};

  const auto result = (vec_u64_t)vec_vbpermq((vec_u8_t)vec, perm_mask);
#if SIMDUTF_IS_BIG_ENDIAN
  return static_cast<uint16_t>(result[0]);
#else
  return static_cast<uint16_t>(result[1]);
#endif
}

/* begin file src/simdutf/ppc64/simd8-inl.h */
// file included directly

template <typename T> struct base8 {
  using vector_type = vector_u8_type_for_element<T>;
  vector_type value;
  static const int SIZE = sizeof(vector_type);
  static const int ELEMENTS = sizeof(vector_type) / sizeof(T);

  // Zero constructor
  simdutf_really_inline base8() : value{vec_splats(T(0))} {}

  // Conversion from SIMD register
  simdutf_really_inline base8(const vector_type _value) : value{_value} {}

  // Splat scalar
  simdutf_really_inline base8(T v) : value{vec_splats(v)} {}

  // Conversion to SIMD register
  simdutf_really_inline operator const vector_type &() const {
    return this->value;
  }

  template <typename U> simdutf_really_inline void store(U *ptr) const {
    vec_xst(value, 0, reinterpret_cast<T *>(ptr));
  }

  template <typename SIMD8> void operator|=(const SIMD8 other) {
    this->value = vec_or(this->value, other.value);
  }

  template <int N = 1> vector_type prev_aux(vector_type prev_chunk) const {
    vector_type chunk = this->value;
#if !SIMDUTF_IS_BIG_ENDIAN
    chunk = (vector_type)vec_reve(this->value);
    prev_chunk = (vector_type)vec_reve((vector_type)prev_chunk);
#endif
    chunk = (vector_type)vec_sld((vector_type)prev_chunk, (vector_type)chunk,
                                 16 - N);
#if !SIMDUTF_IS_BIG_ENDIAN
    chunk = (vector_type)vec_reve((vector_type)chunk);
#endif
    return chunk;
  }

  simdutf_really_inline bool is_ascii() const {
    return move_mask_u8(this->value) == 0;
  }

  simdutf_really_inline uint16_t to_bitmask() const {
    return move_mask_u8(value);
  }

  template <endianness big_endian>
  simdutf_really_inline void store_bytes_as_utf16(char16_t *p) const {
    const vector_type zero = vec_splats(T(0));

    if (big_endian) {
      const vec_u8_t perm_lo = {16, 0, 16, 1, 16, 2, 16, 3,
                                16, 4, 16, 5, 16, 6, 16, 7};
      const vec_u8_t perm_hi = {16, 8,  16, 9,  16, 10, 16, 11,
                                16, 12, 16, 13, 16, 14, 16, 15};

      const vector_type v0 = vec_perm(value, zero, perm_lo);
      const vector_type v1 = vec_perm(value, zero, perm_hi);

#if defined(__clang__)
      vec_xst(v0, 0, reinterpret_cast<T *>(p));
      vec_xst(v1, 16, reinterpret_cast<T *>(p));
#else
      vec_xst(v0, 0, reinterpret_cast<vector_type *>(p));
      vec_xst(v1, 16, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
    } else {
      const vec_u8_t perm_lo = {0, 16, 1, 16, 2, 16, 3, 16,
                                4, 16, 5, 16, 6, 16, 7, 16};
      const vec_u8_t perm_hi = {8,  16, 9,  16, 10, 16, 11, 16,
                                12, 16, 13, 16, 14, 16, 15, 16};

      const vector_type v0 = vec_perm(value, zero, perm_lo);
      const vector_type v1 = vec_perm(value, zero, perm_hi);

#if defined(__clang__)
      vec_xst(v0, 0, reinterpret_cast<T *>(p));
      vec_xst(v1, 16, reinterpret_cast<T *>(p));
#else
      vec_xst(v0, 0, reinterpret_cast<vector_type *>(p));
      vec_xst(v1, 16, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
    }
  }

  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
    store_bytes_as_utf16<big_endian>(p);
  }

  simdutf_really_inline void store_bytes_as_utf32(char32_t *p) const {
    const vector_type zero = vec_splats(T(0));

#if SIMDUTF_IS_BIG_ENDIAN
    const vec_u8_t perm0 = {16, 16, 16, 0, 16, 16, 16, 1,
                            16, 16, 16, 2, 16, 16, 16, 3};

    const vec_u8_t perm1 = {16, 16, 16, 4, 16, 16, 16, 5,
                            16, 16, 16, 6, 16, 16, 16, 7};

    const vec_u8_t perm2 = {16, 16, 16, 8,  16, 16, 16, 9,
                            16, 16, 16, 10, 16, 16, 16, 11};

    const vec_u8_t perm3 = {16, 16, 16, 12, 16, 16, 16, 13,
                            16, 16, 16, 14, 16, 16, 16, 15};
#else
    const vec_u8_t perm0 = {0, 16, 16, 16, 1, 16, 16, 16,
                            2, 16, 16, 16, 3, 16, 16, 16};

    const vec_u8_t perm1 = {4, 16, 16, 16, 5, 16, 16, 16,
                            6, 16, 16, 16, 7, 16, 16, 16};

    const vec_u8_t perm2 = {8,  16, 16, 16, 9,  16, 16, 16,
                            10, 16, 16, 16, 11, 16, 16, 16};

    const vec_u8_t perm3 = {12, 16, 16, 16, 13, 16, 16, 16,
                            14, 16, 16, 16, 15, 16, 16, 16};
#endif // SIMDUTF_IS_BIG_ENDIAN

    const vector_type v0 = vec_perm(value, zero, perm0);
    const vector_type v1 = vec_perm(value, zero, perm1);
    const vector_type v2 = vec_perm(value, zero, perm2);
    const vector_type v3 = vec_perm(value, zero, perm3);

    constexpr size_t n = base8<T>::SIZE;

#if defined(__clang__)
    vec_xst(v0, 0 * n, reinterpret_cast<T *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<T *>(p));
    vec_xst(v2, 2 * n, reinterpret_cast<T *>(p));
    vec_xst(v3, 3 * n, reinterpret_cast<T *>(p));
#else
    vec_xst(v0, 0 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v2, 2 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v3, 3 * n, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
  }

  simdutf_really_inline void store_words_as_utf32(char32_t *p) const {
    const vector_type zero = vec_splats(T(0));

#if SIMDUTF_IS_BIG_ENDIAN
    const vec_u8_t perm0 = {16, 16, 0, 1, 16, 16, 2, 3,
                            16, 16, 4, 5, 16, 16, 6, 7};
    const vec_u8_t perm1 = {16, 16, 8,  9,  16, 16, 10, 11,
                            16, 16, 12, 13, 16, 16, 14, 15};
#else
    const vec_u8_t perm0 = {0, 1, 16, 16, 2, 3, 16, 16,
                            4, 5, 16, 16, 6, 7, 16, 16};
    const vec_u8_t perm1 = {8,  9,  16, 16, 10, 11, 16, 16,
                            12, 13, 16, 16, 14, 15, 16, 16};
#endif // SIMDUTF_IS_BIG_ENDIAN

    const vector_type v0 = vec_perm(value, zero, perm0);
    const vector_type v1 = vec_perm(value, zero, perm1);

    constexpr size_t n = base8<T>::SIZE;

#if defined(__clang__)
    vec_xst(v0, 0 * n, reinterpret_cast<T *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<T *>(p));
#else
    vec_xst(v0, 0 * n, reinterpret_cast<vector_type *>(p));
    vec_xst(v1, 1 * n, reinterpret_cast<vector_type *>(p));
#endif // defined(__clang__)
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
    store_bytes_as_utf32(p);
  }
};

// Forward declaration
template <typename T> struct simd8;

template <typename T>
simd8<bool> operator==(const simd8<T> a, const simd8<T> b);

template <typename T>
simd8<bool> operator!=(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator&(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator|(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator^(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<T> operator+(const simd8<T> a, const simd8<T> b);

template <typename T> simd8<bool> operator<(const simd8<T> a, const simd8<T> b);

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  using super = base8<bool>;

  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return (vector_type)vec_splats((unsigned char)(-(!!_value)));
  }

  simdutf_really_inline simd8() : super(vector_type()) {}
  simdutf_really_inline simd8(const vector_type _value) : super(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  template <typename T>
  simdutf_really_inline simd8(simd8<T> other)
      : simd8(vector_type(other.value)) {}

  simdutf_really_inline uint16_t to_bitmask() const {
    return move_mask_u8(value);
  }

  simdutf_really_inline bool any() const {
    return !vec_all_eq(this->value, (vector_type)vec_splats(0));
  }

  simdutf_really_inline bool all() const { return to_bitmask() == 0xffff; }

  simdutf_really_inline simd8<bool> operator~() const {
    return this->value ^ (vector_type)splat(true);
  }
};

template <typename T> struct base8_numeric : base8<T> {
  using super = base8<T>;
  using vector_type = typename super::vector_type;

  static simdutf_really_inline simd8<T> splat(T value) {
    return (vector_type)vec_splats(value);
  }

  static simdutf_really_inline simd8<T> zero() { return splat(0); }

  template <typename U>
  static simdutf_really_inline simd8<T> load(const U *values) {
    return vec_xl(0, reinterpret_cast<const T *>(values));
  }

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const vector_type _value)
      : base8<T>(_value) {}

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
    this->value = vec_sub(this->value, other.value);
    return *static_cast<simd8<T> *>(this);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return (vector_type)vec_perm((vector_type)lookup_table,
                                 (vector_type)lookup_table, this->value);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_32(const simd8<L> lookup_table_lo,
            const simd8<L> lookup_table_hi) const {
    return (vector_type)vec_perm(lookup_table_lo.value, lookup_table_hi.value,
                                 this->value);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  using Self = simd8<uint8_t>;

  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const vector_type _value)
      : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8((vector_type){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                            v12, v13, v14, v15}) {}
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdutf_really_inline static simd8<uint8_t>
  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
            uint8_t v15) {
    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                          v13, v14, v15);
  }

  simdutf_really_inline bool is_ascii() const {
    return move_mask_u8(this->value) == 0;
  }

  template <typename T>
  simdutf_really_inline simd8(simd8<T> other)
      : simd8(vector_type(other.value)) {}

  template <int N>
  simdutf_really_inline Self prev(const Self prev_chunk) const {
    return prev_aux<N>(prev_chunk.value);
  }

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return (vector_type)vec_subs(this->value, (vector_type)other);
  }

  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }

  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  lt_bits(const simd8<uint8_t> other) const {
    return other.saturating_sub(*this);
  }

  // Bit-specific operations
  simdutf_really_inline bool bits_not_set_anywhere() const {
    return vec_all_eq(this->value, (vector_type)vec_splats(0));
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return !bits_not_set_anywhere();
  }

  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return simd8<uint8_t>(
        (vector_type)vec_sr(this->value, (vector_type)vec_splat_u8(N)));
  }

  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return simd8<uint8_t>(
        (vector_type)vec_sl(this->value, (vector_type)vec_splat_u8(N)));
  }
  void dump() const {
#ifdef SIMDUTF_LOGGING
    uint8_t tmp[16];
    store(tmp);
    for (int i = 0; i < 16; i++) {
      if (i == 0) {
        printf("[%02x", tmp[i]);
      } else if (i == 15) {
        printf(" %02x]", tmp[i]);
      } else {
        printf(" %02x", tmp[i]);
      }
    }
    putchar('\n');
#endif // SIMDUTF_LOGGING
  }

  void dump_ascii() const {
#ifdef SIMDUTF_LOGGING
    uint8_t tmp[16];
    store(tmp);
    for (int i = 0; i < 16; i++) {
      if (i == 0) {
        printf("[%c", tmp[i]);
      } else if (i == 15) {
        printf("%c]", tmp[i]);
      } else {
        printf("%c", tmp[i]);
      }
    }
    putchar('\n');
#endif // SIMDUTF_LOGGING
  }
};

// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const vector_type _value)
      : base8_numeric<int8_t>(_value) {}

  template <typename T>
  simdutf_really_inline simd8(simd8<T> other)
      : simd8(vector_type(other.value)) {}

  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}

  simdutf_really_inline operator simd8<uint8_t>() const;

  // Saturated math
  simdutf_really_inline simd8<int8_t>
  saturating_add(const simd8<int8_t> other) const {
    return (vector_type)vec_adds(this->value, other.value);
  }

  void dump() const {
    int8_t tmp[16];
    store(tmp);
    for (int i = 0; i < 16; i++) {
      if (i == 0) {
        printf("[%02x", tmp[i]);
      } else if (i == 15) {
        printf("%02x]", tmp[i]);
      } else {
        printf("%02x", tmp[i]);
      }
    }
    putchar('\n');
  }
};

template <typename T>
simd8<bool> operator==(const simd8<T> a, const simd8<T> b) {
  return vec_cmpeq(a.value, b.value);
}

template <typename T>
simd8<bool> operator!=(const simd8<T> a, const simd8<T> b) {
  return vec_cmpne(a.value, b.value);
}

template <typename T> simd8<T> operator&(const simd8<T> a, const simd8<T> b) {
  return vec_and(a.value, b.value);
}

template <typename T, typename U> simd8<T> operator&(const simd8<T> a, U b) {
  return vec_and(a.value, vec_splats(T(b)));
}

template <typename T> simd8<T> operator|(const simd8<T> a, const simd8<T> b) {
  return vec_or(a.value, b.value);
}

template <typename T> simd8<T> operator^(const simd8<T> a, const simd8<T> b) {
  return vec_xor(a.value, b.value);
}

template <typename T, typename U> simd8<T> operator^(const simd8<T> a, U b) {
  return vec_xor(a.value, vec_splats(T(b)));
}

template <typename T> simd8<T> operator+(const simd8<T> a, const simd8<T> b) {
  return vec_add(a.value, b.value);
}

template <typename T, typename U> simd8<T> operator+(const simd8<T> a, U b) {
  return vec_add(a.value, vec_splats(T(b)));
}

simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return (simd8<uint8_t>::vector_type)value;
}

template <typename T>
simd8<bool> operator<(const simd8<T> a, const simd8<T> b) {
  return vec_cmplt(a.value, b.value);
}

template <typename T>
simd8<bool> operator>(const simd8<T> a, const simd8<T> b) {
  return vec_cmpgt(a.value, b.value);
}

template <typename T>
simd8<bool> operator>=(const simd8<T> a, const simd8<T> b) {
  return vec_cmpge(a.value, b.value);
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static constexpr size_t ELEMENTS = simd8<T>::ELEMENTS;

  static_assert(NUM_CHUNKS == 4,
                "PPC64 kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed
  simd8x64(simd8x64<T> &&) = default;

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + ELEMENTS * 0);
    this->chunks[1].store(ptr + ELEMENTS * 1);
    this->chunks[2].store(ptr + ELEMENTS * 2);
    this->chunks[3].store(ptr + ELEMENTS * 3);
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t eq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
                          this->chunks[2] == mask, this->chunks[3] == mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(this->chunks[0]) >= mask,
                          simd8<uint8_t>(this->chunks[1]) >= mask,
                          simd8<uint8_t>(this->chunks[2]) >= mask,
                          simd8<uint8_t>(this->chunks[3]) >= mask)
        .to_bitmask();
  }

  void dump() const {
    puts("");
    for (int i = 0; i < 4; i++) {
      printf("chunk[%d] = ", i);
      this->chunks[i].dump();
    }
  }
}; // struct simd8x64<T>

simdutf_really_inline simd8<uint8_t> avg(const simd8<uint8_t> a,
                                         const simd8<uint8_t> b) {
  return vec_avg(a.value, b.value);
}
/* end file src/simdutf/ppc64/simd8-inl.h */
/* begin file src/simdutf/ppc64/simd16-inl.h */
// file included directly

template <typename T> struct simd16;

template <typename T> struct base16 {
  using vector_type = vector_u16_type_for_element<T>;
  static const int SIZE = sizeof(vector_type);
  static const int ELEMENTS = sizeof(vector_type) / sizeof(T);

  vector_type value;

  // Zero constructor
  simdutf_really_inline base16() : value{vector_type()} {}

  // Conversion from SIMD register
  simdutf_really_inline base16(const vector_type _value) : value{_value} {}
  void dump() const {
#ifdef SIMDUTF_LOGGING
    uint16_t tmp[8];
    vec_xst(value, 0, reinterpret_cast<vector_type *>(tmp));
    for (int i = 0; i < 8; i++) {
      if (i == 0) {
        printf("[%04x", tmp[i]);
      } else if (i == 8 - 1) {
        printf(" %04x]", tmp[i]);
      } else {
        printf(" %04x", tmp[i]);
      }
    }
    putchar('\n');
#endif // SIMDUTF_LOGGING
  }
};

// Forward declaration
template <typename> struct simd16;

template <typename T>
simd16<bool> operator==(const simd16<T> a, const simd16<T> b);

template <typename T, typename U>
simd16<bool> operator==(const simd16<T> a, U b);

template <typename T> simd16<T> operator&(const simd16<T> a, const simd16<T> b);

template <typename T> simd16<T> operator|(const simd16<T> a, const simd16<T> b);

template <typename T, typename U> simd16<T> operator|(const simd16<T> a, U b);

template <typename T, typename U> simd16<T> operator^(const simd16<T> a, U b);

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return (vector_type)vec_splats(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}

  simdutf_really_inline simd16(const vector_type _value)
      : base16<bool>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline uint16_t to_bitmask() const {
    return move_mask_u8(value);
  }

  simdutf_really_inline bool any() const {
    const auto tmp = vec_u64_t(value);

    return tmp[0] || tmp[1]; // Note: logical or, not binary one
  }

  simdutf_really_inline bool is_zero() const {
    const auto tmp = vec_u64_t(value);

    return (tmp[0] | tmp[1]) == 0;
  }

  simdutf_really_inline simd16<bool> &operator|=(const simd16<bool> rhs) {
    value = vec_or(this->value, rhs.value);
    return *this;
  }
};

template <typename T> struct base16_numeric : base16<T> {
  using vector_type = typename base16<T>::vector_type;

  static simdutf_really_inline simd16<T> splat(T _value) {
    return vec_splats(_value);
  }

  static simdutf_really_inline simd16<T> zero() { return splat(0); }

  template <typename U>
  static simdutf_really_inline simd16<T> load(const U *ptr) {
    return vec_xl(0, reinterpret_cast<const T *>(ptr));
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}
  simdutf_really_inline base16_numeric(const vector_type _value)
      : base16<T>(_value) {}

  // Store to array
  template <typename U> simdutf_really_inline void store(U *dst) const {
#if defined(__clang__)
    return vec_xst(this->value, 0, reinterpret_cast<T *>(dst));
#else
    return vec_xst(this->value, 0, reinterpret_cast<vector_type *>(dst));
#endif // defined(__clang__)
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const {
    return vec_xor(this->value, vec_splats(T(0xffff)));
  }
};

// Signed code units
template <> struct simd16<int16_t> : base16_numeric<int16_t> {
  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
  simdutf_really_inline simd16(const vector_type _value)
      : base16_numeric<int16_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
  // Array constructor
  simdutf_really_inline operator simd16<uint16_t>() const;
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const vector_type _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  simdutf_really_inline bool is_ascii() const {
    return vec_all_lt(value, vec_splats(uint16_t(128)));
  }

  // Order-specific operations
  simdutf_really_inline simd16<uint16_t>
  max_val(const simd16<uint16_t> other) const {
    return vec_max(this->value, other.value);
  }
  simdutf_really_inline simd16<uint16_t>
  min_val(const simd16<uint16_t> other) const {
    return vec_min(this->value, other.value);
  }
  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd16<bool>
  operator<=(const simd16<uint16_t> other) const {
    return other.max_val(*this) == other;
  }

  simdutf_really_inline simd16<bool>
  operator>=(const simd16<uint16_t> other) const {
    return other.min_val(*this) == other;
  }

  simdutf_really_inline simd16<bool>
  operator<(const simd16<uint16_t> other) const {
    return vec_cmplt(value, other.value);
  }

  // Bit-specific operations
  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
    return vec_sr(value, vec_splats(uint16_t(N)));
  }

  template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
    return vec_sl(value, vec_splats(uint16_t(N)));
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return vec_revb(value);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return vec_packs(v0.value, v1.value);
  }
};

template <typename T>
simd16<bool> operator==(const simd16<T> a, const simd16<T> b) {
  return vec_cmpeq(a.value, b.value);
}

template <typename T, typename U>
simd16<bool> operator==(const simd16<T> a, U b) {
  return vec_cmpeq(a.value, vec_splats(T(b)));
}

template <typename T>
simd16<T> operator&(const simd16<T> a, const simd16<T> b) {
  return vec_and(a.value, b.value);
}

template <typename T, typename U> simd16<T> operator&(const simd16<T> a, U b) {
  return vec_and(a.value, vec_splats(T(b)));
}

template <typename T>
simd16<T> operator|(const simd16<T> a, const simd16<T> b) {
  return vec_or(a.value, b.value);
}

template <typename T, typename U> simd16<T> operator|(const simd16<T> a, U b) {
  return vec_or(a.value, vec_splats(T(b)));
}

template <typename T>
simd16<T> operator^(const simd16<T> a, const simd16<T> b) {
  return vec_xor(a.value, b.value);
}

template <typename T, typename U> simd16<T> operator^(const simd16<T> a, U b) {
  return vec_xor(a.value, vec_splats(T(b)));
}

simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
  return (vec_u16_t)(value);
}

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 4,
                "AltiVec kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd16<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
    this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
    this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                           this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                           this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t eq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
                           this->chunks[2] == mask, this->chunks[3] == mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
    const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
    const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
    return simd16x32<bool>(
               (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
               (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
               (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
               (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
        .to_bitmask();
  }
}; // struct simd16x32<T>
/* end file src/simdutf/ppc64/simd16-inl.h */
/* begin file src/simdutf/ppc64/simd32-inl.h */
// file included directly

template <typename T> struct simd32;

template <typename T> struct base32 {
  using vector_type = vector_u32_type_for_element<T>;
  static const int SIZE = sizeof(vector_type);
  static const int ELEMENTS = sizeof(vector_type) / sizeof(T);

  vector_type value;

  // Zero constructor
  simdutf_really_inline base32() : value{vector_type()} {}

  // Conversion from SIMD register
  simdutf_really_inline base32(const vector_type _value) : value{_value} {}

  // Splat for scalar
  simdutf_really_inline base32(T scalar) : value{vec_splats(scalar)} {}

  template <typename Pointer>
  simdutf_really_inline base32(const Pointer *ptr)
      : base32(vec_xl(0, reinterpret_cast<const T *>(ptr))) {}

  // Store to array
  template <typename U> simdutf_really_inline void store(U *dst) const {
#if defined(__clang__)
    return vec_xst(this->value, 0, reinterpret_cast<T *>(dst));
#else
    return vec_xst(this->value, 0, reinterpret_cast<vector_type *>(dst));
#endif // defined(__clang__)
  }
  void dump(const char *name = nullptr) const {
#ifdef SIMDUTF_LOGGING
    if (name != nullptr) {
      printf("%-10s = ", name);
    }

    uint32_t tmp[4];
    vec_xst(value, 0, reinterpret_cast<vector_type *>(tmp));
    for (int i = 0; i < 4; i++) {
      if (i == 0) {
        printf("[%08x", tmp[i]);
      } else if (i == 4 - 1) {
        printf(" %08x]", tmp[i]);
      } else {
        printf(" %08x", tmp[i]);
      }
    }
    putchar('\n');
#endif // SIMDUTF_LOGGING
  }
};

template <typename T> struct base32_numeric : base32<T> {
  using super = base32<T>;
  using vector_type = typename super::vector_type;

  static simdutf_really_inline simd32<T> splat(T _value) {
    return vec_splats(_value);
  }

  static simdutf_really_inline simd32<T> zero() { return splat(0); }

  template <typename U>
  static simdutf_really_inline simd32<T> load(const U *values) {
    return vec_xl(0, reinterpret_cast<const T *>(values));
  }

  simdutf_really_inline base32_numeric() : base32<T>() {}

  simdutf_really_inline base32_numeric(const vector_type _value)
      : base32<T>(_value) {}

  // Addition/subtraction are the same for signed and unsigned
  simdutf_really_inline simd32<T> operator+(const simd32<T> other) const {
    return vec_add(this->value, other.value);
  }

  simdutf_really_inline simd32<T> operator-(const simd32<T> other) const {
    return vec_sub(this->value, other.value);
  }

  simdutf_really_inline simd32<T> &operator+=(const simd32<T> other) {
    *this = *this + other;
    return *static_cast<simd32<T> *>(this);
  }

  simdutf_really_inline simd32<T> &operator-=(const simd32<T> other) {
    *this = *this - other;
    return *static_cast<simd32<T> *>(this);
  }
};

// Forward declaration
template <typename> struct simd32;

template <typename T>
simd32<bool> operator==(const simd32<T> a, const simd32<T> b);

template <typename T>
simd32<bool> operator!=(const simd32<T> a, const simd32<T> b);

template <typename T>
simd32<bool> operator>(const simd32<T> a, const simd32<T> b);

template <typename T> simd32<bool> operator==(const simd32<T> a, T b);

template <typename T> simd32<bool> operator!=(const simd32<T> a, T b);

template <typename T> simd32<T> operator&(const simd32<T> a, const simd32<T> b);

template <typename T> simd32<T> operator|(const simd32<T> a, const simd32<T> b);

template <typename T> simd32<T> operator^(const simd32<T> a, const simd32<T> b);

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd32<bool> : base32<bool> {
  static simdutf_really_inline simd32<bool> splat(bool _value) {
    return (vector_type)vec_splats(uint32_t(-(!!_value)));
  }

  simdutf_really_inline simd32(const vector_type _value)
      : base32<bool>(_value) {}

  // Splat constructor
  simdutf_really_inline simd32(bool _value) : base32<bool>(splat(_value)) {}

  simdutf_really_inline uint16_t to_bitmask() const {
    return move_mask_u8(value);
  }

  simdutf_really_inline bool any() const {
    const vec_u64_t tmp = (vec_u64_t)value;

    return tmp[0] || tmp[1]; // Note: logical or, not binary one
  }

  simdutf_really_inline bool is_zero() const {
    const vec_u64_t tmp = (vec_u64_t)value;

    return (tmp[0] | tmp[1]) == 0;
  }

  simdutf_really_inline simd32<bool> operator~() const {
    return (vec_bool32_t)vec_xor(this->value, vec_splats(uint32_t(0xffffffff)));
  }
};

// Unsigned code units
template <> struct simd32<uint32_t> : base32_numeric<uint32_t> {
  simdutf_really_inline simd32() : base32_numeric<uint32_t>() {}

  simdutf_really_inline simd32(const vector_type _value)
      : base32_numeric<uint32_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd32(uint32_t _value) : simd32(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd32(const char32_t *values)
      : simd32(load(reinterpret_cast<const uint32_t *>(values))) {}

  // Bit-specific operations
  template <int N> simdutf_really_inline simd32<uint32_t> shr() const {
    return vec_sr(value, vec_splats(uint32_t(N)));
  }

  template <int N> simdutf_really_inline simd32<uint32_t> shl() const {
    return vec_sl(value, vec_splats(uint32_t(N)));
  }

  // Change the endianness
  simdutf_really_inline simd32<uint32_t> swap_bytes() const {
    return vec_revb(value);
  }

  simdutf_really_inline uint64_t sum() const {
    return uint64_t(value[0]) + uint64_t(value[1]) + uint64_t(value[2]) +
           uint64_t(value[3]);
  }

  static simdutf_really_inline simd16<uint16_t>
  pack(const simd32<uint32_t> &v0, const simd32<uint32_t> &v1) {
    return vec_packs(v0.value, v1.value);
  }
};

template <typename T>
simd32<bool> operator==(const simd32<T> a, const simd32<T> b) {
  return vec_cmpeq(a.value, b.value);
}

template <typename T>
simd32<bool> operator!=(const simd32<T> a, const simd32<T> b) {
  return vec_cmpne(a.value, b.value);
}

template <typename T> simd32<bool> operator==(const simd32<T> a, T b) {
  return vec_cmpeq(a.value, vec_splats(b));
}

template <typename T> simd32<bool> operator!=(const simd32<T> a, T b) {
  return vec_cmpne(a.value, vec_splats(b));
}

template <typename T>
simd32<bool> operator>(const simd32<T> a, const simd32<T> b) {
  return vec_cmpgt(a.value, b.value);
}

template <typename T>
simd32<bool> operator>=(const simd32<T> a, const simd32<T> b) {
  return vec_cmpge(a.value, b.value);
}

template <typename T>
simd32<T> operator&(const simd32<T> a, const simd32<T> b) {
  return vec_and(a.value, b.value);
}

template <typename T, typename U> simd32<T> operator&(const simd32<T> a, U b) {
  return vec_and(a.value, vec_splats(T(b)));
}

template <typename T>
simd32<T> operator|(const simd32<T> a, const simd32<T> b) {
  return vec_or(a.value, b.value);
}

template <typename T>
simd32<T> operator^(const simd32<T> a, const simd32<T> b) {
  return vec_xor(a.value, b.value);
}

template <typename T, typename U> simd32<T> operator^(const simd32<T> a, U b) {
  return vec_xor(a.value, vec_splats(T(b)));
}

template <typename T> simd32<T> max_val(const simd32<T> a, const simd32<T> b) {
  return vec_max(a.value, b.value);
}

template <typename T>
simdutf_really_inline simd32<T> min(const simd32<T> b, const simd32<T> a) {
  return vec_min(a.value, b.value);
}
/* end file src/simdutf/ppc64/simd32-inl.h */

template <typename T>
simd8<T> select(const simd8<T> cond, const simd8<T> val_true,
                const simd8<T> val_false) {
  return vec_sel(val_false.value, val_true.value, cond.value);
}

template <typename T>
simd8<T> select(const T cond, const simd8<T> val_true,
                const simd8<T> val_false) {
  return vec_sel(val_false.value, val_true.value, vec_splats(cond));
}

template <typename T>
simd16<T> select(const simd16<T> cond, const simd16<T> val_true,
                 const simd16<T> val_false) {
  return vec_sel(val_false.value, val_true.value, cond.value);
}

template <typename T>
simd16<T> select(const T cond, const simd16<T> val_true,
                 const simd16<T> val_false) {
  return vec_sel(val_false.value, val_true.value, vec_splats(cond));
}

template <typename T>
simd32<T> select(const simd32<T> cond, const simd32<T> val_true,
                 const simd32<T> val_false) {
  return vec_sel(val_false.value, val_true.value, cond.value);
}

template <typename T>
simd32<T> select(const T cond, const simd32<T> val_true,
                 const simd32<T> val_false) {
  return vec_sel(val_false.value, val_true.value, vec_splats(cond));
}

using vector_u8 = simd8<uint8_t>;
using vector_u16 = simd16<uint16_t>;
using vector_u32 = simd32<uint32_t>;
using vector_i8 = simd8<int8_t>;

simdutf_really_inline vector_u8 as_vector_u8(const vector_u16 v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_u8 as_vector_u8(const vector_u32 v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_u8 as_vector_u8(const vector_i8 v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_u8 as_vector_u8(const simd16<bool> v) {
  return vector_u8::vector_type(v.value);
}

simdutf_really_inline vector_i8 as_vector_i8(const vector_u8 v) {
  return vector_i8::vector_type(v.value);
}

simdutf_really_inline vector_u16 as_vector_u16(const vector_u8 v) {
  return vector_u16::vector_type(v.value);
}

simdutf_really_inline vector_u16 as_vector_u16(const simd16<bool> v) {
  return vector_u16::vector_type(v.value);
}

simdutf_really_inline vector_u32 as_vector_u32(const vector_u8 v) {
  return vector_u32::vector_type(v.value);
}

simdutf_really_inline vector_u32 as_vector_u32(const vector_u16 v) {
  return vector_u32::vector_type(v.value);
}

simdutf_really_inline vector_u32 max(vector_u32 a, vector_u32 b) {
  return vec_max(a.value, b.value);
}

simdutf_really_inline vector_u32 max(vector_u32 a, vector_u32 b, vector_u32 c) {
  return max(max(a, b), c);
}

simdutf_really_inline vector_u32 sum4bytes(vector_u8 bytes, vector_u32 acc) {
  return vec_sum4s(bytes.value, acc.value);
}

} // namespace simd
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf

#endif // SIMDUTF_PPC64_SIMD_INPUT_H
/* end file src/simdutf/ppc64/simd.h */

/* begin file src/simdutf/ppc64/end.h */
/* end file src/simdutf/ppc64/end.h */

#endif // SIMDUTF_IMPLEMENTATION_PPC64

#endif // SIMDUTF_PPC64_H
/* end file src/simdutf/ppc64.h */
/* begin file src/simdutf/rvv.h */
#ifndef SIMDUTF_RVV_H
#define SIMDUTF_RVV_H

#ifdef SIMDUTF_FALLBACK_H
  #error "rvv.h must be included before fallback.h"
#endif


#define SIMDUTF_CAN_ALWAYS_RUN_RVV SIMDUTF_IS_RVV

#ifndef SIMDUTF_IMPLEMENTATION_RVV
  #define SIMDUTF_IMPLEMENTATION_RVV                                           \
    (SIMDUTF_CAN_ALWAYS_RUN_RVV ||                                             \
     (SIMDUTF_IS_RISCV64 && SIMDUTF_HAS_RVV_INTRINSICS &&                      \
      SIMDUTF_HAS_RVV_TARGET_REGION))
#endif

#if SIMDUTF_IMPLEMENTATION_RVV

  #if SIMDUTF_CAN_ALWAYS_RUN_RVV
    #define SIMDUTF_TARGET_RVV
  #else
    #define SIMDUTF_TARGET_RVV SIMDUTF_TARGET_REGION("arch=+v")
  #endif
  #if !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS
    #define SIMDUTF_TARGET_ZVBB SIMDUTF_TARGET_REGION("arch=+v,+zvbb")
  #endif

namespace simdutf {
namespace rvv {} // namespace rvv
} // namespace simdutf

/* begin file src/simdutf/rvv/implementation.h */
#ifndef SIMDUTF_RVV_IMPLEMENTATION_H
#define SIMDUTF_RVV_IMPLEMENTATION_H


namespace simdutf {
namespace rvv {

namespace {
using namespace simdutf;
} // namespace

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("rvv", "RISC-V Vector Extension",
                                internal::instruction_set::RVV),
        _supports_zvbb(internal::detect_supported_architectures() &
                       internal::instruction_set::ZVBB) {}
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;
  simdutf_warn_unused size_t utf32_length_from_utf16le(
      const char16_t *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf32_length_from_utf16be(
      const char16_t *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;

  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;
private:
  const bool _supports_zvbb;

#if SIMDUTF_IS_ZVBB
  bool supports_zvbb() const { return true; }
#elif SIMDUTF_HAS_ZVBB_INTRINSICS
  bool supports_zvbb() const { return _supports_zvbb; }
#else
  bool supports_zvbb() const { return false; }
#endif
};

} // namespace rvv
} // namespace simdutf

#endif // SIMDUTF_RVV_IMPLEMENTATION_H
/* end file src/simdutf/rvv/implementation.h */
/* begin file src/simdutf/rvv/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "rvv"
// #define SIMDUTF_IMPLEMENTATION rvv

#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_TARGET_RVV
#endif
/* end file src/simdutf/rvv/begin.h */
/* begin file src/simdutf/rvv/intrinsics.h */
#ifndef SIMDUTF_RVV_INTRINSICS_H
#define SIMDUTF_RVV_INTRINSICS_H


#include <riscv_vector.h>

#if __riscv_v_intrinsic >= 1000000 || __GCC__ >= 14
  #define simdutf_vrgather_u8m1x2(tbl, idx)                                    \
    __riscv_vcreate_v_u8m1_u8m2(                                               \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0),        \
                                 __riscv_vsetvlmax_e8m1()),                    \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1),        \
                                 __riscv_vsetvlmax_e8m1()));

  #define simdutf_vrgather_u8m1x4(tbl, idx)                                    \
    __riscv_vcreate_v_u8m1_u8m4(                                               \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0),        \
                                 __riscv_vsetvlmax_e8m1()),                    \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1),        \
                                 __riscv_vsetvlmax_e8m1()),                    \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2),        \
                                 __riscv_vsetvlmax_e8m1()),                    \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3),        \
                                 __riscv_vsetvlmax_e8m1()));
#else
  // This has worse codegen on gcc
  #define simdutf_vrgather_u8m1x2(tbl, idx)                                    \
    __riscv_vset_v_u8m1_u8m2(                                                  \
        __riscv_vlmul_ext_v_u8m1_u8m2(__riscv_vrgather_vv_u8m1(                \
            tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())), \
        1,                                                                     \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1),        \
                                 __riscv_vsetvlmax_e8m1()))

  #define simdutf_vrgather_u8m1x4(tbl, idx)                                    \
    __riscv_vset_v_u8m1_u8m4(                                                  \
        __riscv_vset_v_u8m1_u8m4(                                              \
            __riscv_vset_v_u8m1_u8m4(                                          \
                __riscv_vlmul_ext_v_u8m1_u8m4(__riscv_vrgather_vv_u8m1(        \
                    tbl, __riscv_vget_v_u8m4_u8m1(idx, 0),                     \
                    __riscv_vsetvlmax_e8m1())),                                \
                1,                                                             \
                __riscv_vrgather_vv_u8m1(tbl,                                  \
                                         __riscv_vget_v_u8m4_u8m1(idx, 1),     \
                                         __riscv_vsetvlmax_e8m1())),           \
            2,                                                                 \
            __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2),    \
                                     __riscv_vsetvlmax_e8m1())),               \
        3,                                                                     \
        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3),        \
                                 __riscv_vsetvlmax_e8m1()))
#endif

/* Zvbb adds dedicated support for endianness swaps with vrev8, but if we can't
 * use that, we have to emulate it with the standard V extension.
 * Using LMUL=1 vrgathers could be faster than the srl+macc variant, but that
 * would increase register pressure, and vrgather implementations performance
 * varies a lot. */
enum class simdutf_ByteFlip { NONE, V, ZVBB };

template <simdutf_ByteFlip method>
simdutf_really_inline static uint16_t simdutf_byteflip(uint16_t v) {
  if (method != simdutf_ByteFlip::NONE)
    return (uint16_t)((v * 1u) << 8 | (v * 1u) >> 8);
  return v;
}

#ifdef SIMDUTF_TARGET_ZVBB
SIMDUTF_UNTARGET_REGION
SIMDUTF_TARGET_ZVBB
#endif

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m1_t simdutf_byteflip(vuint16m1_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m1(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m1(__riscv_vsrl_vx_u16m1(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m2_t simdutf_byteflip(vuint16m2_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m2(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m4_t simdutf_byteflip(vuint16m4_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m4(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m4(__riscv_vsrl_vx_u16m4(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}

template <simdutf_ByteFlip method>
simdutf_really_inline static vuint16m8_t simdutf_byteflip(vuint16m8_t v,
                                                          size_t vl) {
#if SIMDUTF_HAS_ZVBB_INTRINSICS
  if (method == simdutf_ByteFlip::ZVBB)
    return __riscv_vrev8_v_u16m8(v, vl);
#endif
  if (method == simdutf_ByteFlip::V)
    return __riscv_vmacc_vx_u16m8(__riscv_vsrl_vx_u16m8(v, 8, vl), 0x100, v,
                                  vl);
  return v;
}

#ifdef SIMDUTF_TARGET_ZVBB
SIMDUTF_UNTARGET_REGION
SIMDUTF_TARGET_RVV
#endif

#endif //  SIMDUTF_RVV_INTRINSICS_H
/* end file src/simdutf/rvv/intrinsics.h */
/* begin file src/simdutf/rvv/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

/* end file src/simdutf/rvv/end.h */

#endif // SIMDUTF_IMPLEMENTATION_RVV

#endif // SIMDUTF_RVV_H
/* end file src/simdutf/rvv.h */
/* begin file src/simdutf/lasx.h */
#ifndef SIMDUTF_LASX_H
#define SIMDUTF_LASX_H

#ifdef SIMDUTF_FALLBACK_H
  #error "lasx.h must be included before fallback.h"
#endif


#ifndef SIMDUTF_IMPLEMENTATION_LASX
  #define SIMDUTF_IMPLEMENTATION_LASX (SIMDUTF_IS_LSX)
#endif
#if SIMDUTF_IMPLEMENTATION_LASX && SIMDUTF_IS_LASX
  #define SIMDUTF_CAN_ALWAYS_RUN_LASX 1
#else
  #define SIMDUTF_CAN_ALWAYS_RUN_LASX 0
#endif

#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)

#if SIMDUTF_IMPLEMENTATION_LASX
  #define SIMDUTF_TARGET_LASX SIMDUTF_TARGET_REGION("lasx,lsx")

  // For runtime dispatching to work, we need the lsxintrin to appear
  // before we call SIMDUTF_TARGET_LASX. It is unclear why.
  #include <lsxintrin.h>

namespace simdutf {
/**
 * Implementation for LoongArch ASX.
 */
namespace lasx {} // namespace lasx
} // namespace simdutf

/* begin file src/simdutf/lasx/implementation.h */
#ifndef SIMDUTF_LASX_IMPLEMENTATION_H
#define SIMDUTF_LASX_IMPLEMENTATION_H


namespace simdutf {
namespace lasx {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("lasx", "LOONGARCH ASX",
                                internal::instruction_set::LSX |
                                    internal::instruction_set::LASX) {}
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;
  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char16_t *input, size_t length) const noexcept override;
};

} // namespace lasx
} // namespace simdutf

#endif // SIMDUTF_LASX_IMPLEMENTATION_H
/* end file src/simdutf/lasx/implementation.h */

/* begin file src/simdutf/lasx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lasx"
// #define SIMDUTF_IMPLEMENTATION lasx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1

#if SIMDUTF_CAN_ALWAYS_RUN_LASX
// nothing needed.
#else
SIMDUTF_TARGET_LASX
#endif
/* end file src/simdutf/lasx/begin.h */

  // Declarations
/* begin file src/simdutf/lasx/intrinsics.h */
#ifndef SIMDUTF_LASX_INTRINSICS_H
#define SIMDUTF_LASX_INTRINSICS_H


// This should be the correct header whether
// you use visual studio or other compilers.
#include <lsxintrin.h>
#include <lasxintrin.h>

#if defined(__loongarch_asx)
  #ifdef __clang__
    #define VREGS_PREFIX "$vr"
    #define XREGS_PREFIX "$xr"
  #else // GCC
    #define VREGS_PREFIX "$f"
    #define XREGS_PREFIX "$f"
  #endif
  #define __ALL_REGS                                                           \
    "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,"  \
    "27,28,29,30,31"
// Convert __m128i to __m256i
static inline __m256i ____m256i(__m128i in) {
  __m256i out = __lasx_xvldi(0);
  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " XREGS_PREFIX "\\i    \n\t"
                   "  .irp j," __ALL_REGS "\n\t"
                   "   .ifc %[in], " VREGS_PREFIX "\\j  \n\t"
                   "    xvpermi.q $xr\\i, $xr\\j, 0x0  \n\t"
                   "   .endif                           \n\t"
                   "  .endr                             \n\t"
                   " .endif                             \n\t"
                   ".endr                               \n\t"
                   : [out] "+f"(out)
                   : [in] "f"(in));
  return out;
}
// Convert two __m128i to __m256i
static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
  __m256i out;
  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
                   " .ifc %[hi], " VREGS_PREFIX "\\i    \n\t"
                   "  .irp j," __ALL_REGS "\n\t"
                   "   .ifc %[lo], " VREGS_PREFIX "\\j  \n\t"
                   "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t"
                   "   .endif                           \n\t"
                   "  .endr                             \n\t"
                   " .endif                             \n\t"
                   ".endr                               \n\t"
                   ".ifnc %[out], %[hi]                 \n\t"
                   ".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " XREGS_PREFIX "\\i   \n\t"
                   "  .irp j," __ALL_REGS "\n\t"
                   "   .ifc %[hi], " VREGS_PREFIX "\\j  \n\t"
                   "    xvori.b $xr\\i, $xr\\j, 0       \n\t"
                   "   .endif                           \n\t"
                   "  .endr                             \n\t"
                   " .endif                             \n\t"
                   ".endr                               \n\t"
                   ".endif                              \n\t"
                   : [out] "=f"(out), [hi] "+f"(inhi)
                   : [lo] "f"(inlo));
  return out;
}
// Convert __m256i low part to __m128i
static inline __m128i lasx_extracti128_lo(__m256i in) {
  __m128i out;
  __asm__ volatile(".ifnc %[out], %[in]                 \n\t"
                   ".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
                   "  .irp j," __ALL_REGS "\n\t"
                   "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
                   "    vori.b $vr\\i, $vr\\j, 0        \n\t"
                   "   .endif                           \n\t"
                   "  .endr                             \n\t"
                   " .endif                             \n\t"
                   ".endr                               \n\t"
                   ".endif                              \n\t"
                   : [out] "=f"(out)
                   : [in] "f"(in));
  return out;
}
// Convert __m256i high part to __m128i
static inline __m128i lasx_extracti128_hi(__m256i in) {
  __m128i out;
  __asm__ volatile(".irp i," __ALL_REGS "\n\t"
                   " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
                   "  .irp j," __ALL_REGS "\n\t"
                   "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
                   "    xvpermi.q $xr\\i, $xr\\j, 0x11  \n\t"
                   "   .endif                           \n\t"
                   "  .endr                             \n\t"
                   " .endif                             \n\t"
                   ".endr                               \n\t"
                   : [out] "=f"(out)
                   : [in] "f"(in));
  return out;
}
#endif

/*
Encoding of argument for LoongArch64 xvldi instruction.  See:
https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm

1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes

2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes

3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes

4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes

5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes

6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes

7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all
lanes

8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to
all lanes

9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes

10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast
the result as 64-bit elements to all lanes
*/

namespace lasx_vldi {

template <uint16_t v> class const_u16 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);

  constexpr static bool is_case5 = uint16_t(b0) == v;
  constexpr static bool is_case6 = (uint16_t(b1) << 8) == v;
  constexpr static bool is_case9 = (b0 == b1);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00));

public:
  constexpr static uint16_t operation = is_case5    ? 0b10100
                                        : is_case6  ? 0b10101
                                        : is_case9  ? 0b11000
                                        : is_case10 ? 0x11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case5    ? b0
      : is_case6  ? b1
      : is_case9  ? b0
      : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 0xaa : 0x00))
                  : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint32_t v> class const_u32 {
  constexpr static const uint8_t b0 = (v & 0xff);
  constexpr static const uint8_t b1 = ((v >> 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 16) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 24) & 0xff);

  constexpr static bool is_case1 = (uint32_t(b0) == v);
  constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v);
  constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v);
  constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v);
  constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0);
  constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0);
  constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff);
  constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff);
  constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00));

public:
  constexpr static uint16_t operation = is_case1    ? 0b10000
                                        : is_case2  ? 0b10001
                                        : is_case3  ? 0b10010
                                        : is_case4  ? 0b10011
                                        : is_case5  ? 0b10100
                                        : is_case6  ? 0b10101
                                        : is_case7  ? 0b10110
                                        : is_case8  ? 0b10111
                                        : is_case9  ? 0b11000
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case1    ? b0
      : is_case2  ? b1
      : is_case3  ? b2
      : is_case4  ? b3
      : is_case5  ? b0
      : is_case6  ? b1
      : is_case7  ? b1
      : is_case8  ? b2
      : is_case9  ? b0
      : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) |
                     (b2 ? 0x44 : 0x00) | (b3 ? 0x88 : 0x00))
                  : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint64_t v> class const_u64 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff);
  constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff);
  constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff);
  constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff);
  constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff);

  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) &&
      ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) &&
      ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00));

public:
  constexpr static bool is_32bit =
      ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::value;
  constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation;
  constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte;

  constexpr static uint16_t operation = is_32bit    ? op_32bit
                                        : is_case10 ? 0x11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_32bit ? byte_32bit
      : is_case10
          ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) |
             (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) |
             (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00))
          : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

} // namespace lasx_vldi

// Uncomment when running under QEMU affected
// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865
// Versions <= 9.2.2 are affected, likely anything newer is correct.
#ifndef QEMU_VLDI_BUG
// #define QEMU_VLDI_BUG 1
#endif

#ifdef QEMU_VLDI_BUG
  #define lasx_splat_u16(v) __lasx_xvreplgr2vr_h(v)
  #define lasx_splat_u32(v) __lasx_xvreplgr2vr_w(v)
#else
template <uint16_t x> constexpr __m256i lasx_splat_u16_aux() {
  constexpr bool is_imm10 = (int16_t(x) < 512) && (int16_t(x) > -512);
  constexpr uint16_t imm10 = is_imm10 ? x : 0;
  constexpr bool is_vldi = lasx_vldi::const_u16<x>::valid;
  constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u16<x>::value : 0;

  return is_imm10  ? __lasx_xvrepli_h(int16_t(imm10))
         : is_vldi ? __lasx_xvldi(vldi_imm)
                   : __lasx_xvreplgr2vr_h(x);
}

template <uint32_t x> constexpr __m256i lasx_splat_u32_aux() {
  constexpr bool is_imm10 = (int32_t(x) < 512) && (int32_t(x) > -512);
  constexpr uint32_t imm10 = is_imm10 ? x : 0;
  constexpr bool is_vldi = lasx_vldi::const_u32<x>::valid;
  constexpr int vldi_imm = is_vldi ? lasx_vldi::const_u32<x>::value : 0;

  return is_imm10  ? __lasx_xvrepli_w(int32_t(imm10))
         : is_vldi ? __lasx_xvldi(vldi_imm)
                   : __lasx_xvreplgr2vr_w(x);
}

  #define lasx_splat_u16(v) lasx_splat_u16_aux<(v)>()
  #define lasx_splat_u32(v) lasx_splat_u32_aux<(v)>()
#endif // QEMU_VLDI_BUG

#ifndef lsx_splat_u16
  #ifdef QEMU_VLDI_BUG
    #define lsx_splat_u16(v) __lsx_vreplgr2vr_h(v)
    #define lsx_splat_u32(v) __lsx_vreplgr2vr_w(v)
  #else
namespace {
template <uint16_t x> constexpr __m128i lsx_splat_u16_aux() {
  return ((int16_t(x) < 512) && (int16_t(x) > -512))
             ? __lsx_vrepli_h(
                   ((int16_t(x) < 512) && (int16_t(x) > -512)) ? int16_t(x) : 0)
             : (lasx_vldi::const_u16<x>::valid
                    ? __lsx_vldi(lasx_vldi::const_u16<x>::valid
                                     ? lasx_vldi::const_u16<x>::value
                                     : 0)
                    : __lsx_vreplgr2vr_h(x));
}

template <uint32_t x> constexpr __m128i lsx_splat_u32_aux() {
  return ((int32_t(x) < 512) && (int32_t(x) > -512))
             ? __lsx_vrepli_w(
                   ((int32_t(x) < 512) && (int32_t(x) > -512)) ? int32_t(x) : 0)
             : (lasx_vldi::const_u32<x>::valid
                    ? __lsx_vldi(lasx_vldi::const_u32<x>::valid
                                     ? lasx_vldi::const_u32<x>::value
                                     : 0)
                    : __lsx_vreplgr2vr_w(x));
}
} // namespace
    #define lsx_splat_u16(v) lsx_splat_u16_aux<(v)>()
    #define lsx_splat_u32(v) lsx_splat_u32_aux<(v)>()
  #endif // QEMU_VLDI_BUG
#endif   // lsx_splat_u16

#endif //  SIMDUTF_LASX_INTRINSICS_H
/* end file src/simdutf/lasx/intrinsics.h */
/* begin file src/simdutf/lasx/bitmanipulation.h */
#ifndef SIMDUTF_LASX_BITMANIPULATION_H
#define SIMDUTF_LASX_BITMANIPULATION_H

#include <limits>

namespace simdutf {
namespace lasx {
namespace {

simdutf_really_inline int count_ones(uint64_t input_num) {
  return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0);
}

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
  return __builtin_ctzll(input_num);
}
#endif

} // unnamed namespace
} // namespace lasx
} // namespace simdutf

#endif // SIMDUTF_LASX_BITMANIPULATION_H
/* end file src/simdutf/lasx/bitmanipulation.h */
/* begin file src/simdutf/lasx/simd.h */
#ifndef SIMDUTF_LASX_SIMD_H
#define SIMDUTF_LASX_SIMD_H


namespace simdutf {
namespace lasx {
namespace {
namespace simd {

__attribute__((aligned(32))) static const uint8_t prev_shuf_table[32][32] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
     31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
    {0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
     30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
    {0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
     29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
    {0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
     28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
    {0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
     27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
    {0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
     26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
    {0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8,
     25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8},
    {0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7,
     24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7},
    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6,
     23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6},
    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5,
     22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5},
    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4,
     21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4},
    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3,
     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3},
    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2,
     19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2},
    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1,
     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1},
    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0},
    {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
     15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
    {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
     14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
    {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
     13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
    {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
     12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
    {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
     11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
    {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
     10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
    {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
     9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0},
    {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
     8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0},
    {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
     7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0},
    {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
     6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0},
    {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
     5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0},
    {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
     4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0},
    {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
     3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0},
    {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
     2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0},
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0},
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
};

__attribute__((aligned(32))) static const uint8_t bitsel_mask_table[32][32] = {
    {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
    {0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
    {0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
    {0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0},
    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0}};

// Forward-declared so they can be used by splat and friends.
template <typename Child> struct base {
  __m256i value;

  // Zero constructor
  simdutf_really_inline base() : value{__m256i()} {}

  // Conversion from SIMD register
  simdutf_really_inline base(const __m256i _value) : value(_value) {}
  // Conversion to SIMD register
  simdutf_really_inline operator const __m256i &() const { return this->value; }
  simdutf_really_inline operator __m256i &() { return this->value; }
  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    if (big_endian) {
      __m256i zero = __lasx_xvldi(0);
      __m256i in8 = __lasx_xvpermi_d(this->value, 0b11011000);
      __m256i inlow = __lasx_xvilvl_b(in8, zero);
      __m256i inhigh = __lasx_xvilvh_b(in8, zero);
      __lasx_xvst(inlow, reinterpret_cast<uint16_t *>(ptr), 0);
      __lasx_xvst(inhigh, reinterpret_cast<uint16_t *>(ptr), 32);
    } else {
      __m256i inlow = __lasx_vext2xv_hu_bu(this->value);
      __m256i inhigh = __lasx_vext2xv_hu_bu(
          __lasx_xvpermi_q(this->value, this->value, 0b00000001));
      __lasx_xvst(inlow, reinterpret_cast<__m256i *>(ptr), 0);
      __lasx_xvst(inhigh, reinterpret_cast<__m256i *>(ptr), 32);
    }
  }
  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    __m256i in32_0 = __lasx_vext2xv_wu_bu(this->value);
    __lasx_xvst(in32_0, reinterpret_cast<uint32_t *>(ptr), 0);

    __m256i in8_1 = __lasx_xvpermi_d(this->value, 0b00000001);
    __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1);
    __lasx_xvst(in32_1, reinterpret_cast<uint32_t *>(ptr), 32);

    __m256i in8_2 = __lasx_xvpermi_d(this->value, 0b00000010);
    __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2);
    __lasx_xvst(in32_2, reinterpret_cast<uint32_t *>(ptr), 64);

    __m256i in8_3 = __lasx_xvpermi_d(this->value, 0b00000011);
    __m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3);
    __lasx_xvst(in32_3, reinterpret_cast<uint32_t *>(ptr), 96);
  }
  // Bit operations
  simdutf_really_inline Child operator|(const Child other) const {
    return __lasx_xvor_v(this->value, other);
  }
  simdutf_really_inline Child operator&(const Child other) const {
    return __lasx_xvand_v(this->value, other);
  }
  simdutf_really_inline Child operator^(const Child other) const {
    return __lasx_xvxor_v(this->value, other);
  }
  simdutf_really_inline Child &operator|=(const Child other) {
    auto this_cast = static_cast<Child *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }
};

template <typename T> struct simd8;

template <typename T, typename Mask = simd8<bool>>
struct base8 : base<simd8<T>> {
  simdutf_really_inline base8() : base<simd8<T>>() {}
  simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return __lasx_xvseq_b(lhs, rhs);
  }

  static const int SIZE = sizeof(base<T>::value);

  template <unsigned N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    static_assert(N <= 16, "unsupported shift value");

    if (!N)
      return this->value;

    __m256i zero = __lasx_xvldi(0);
    __m256i result, shuf;
    if (N < 16) {
      shuf = __lasx_xvld(prev_shuf_table[N], 0);

      result = __lasx_xvshuf_b(
          __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value,
          shuf);
      __m256i srl_prev = __lasx_xvbsrl_v(
          __lasx_xvpermi_q(zero, prev_chunk.value, 0b00110001), (16 - N));
      __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0);
      result = __lasx_xvbitsel_v(result, srl_prev, mask);

      return result;
    } else if (N == 16) {
      return __lasx_xvpermi_q(this->value, prev_chunk.value, 0b00100001);
    }
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base8<bool> {
  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8() : base8() {}
  simdutf_really_inline simd8(const __m256i _value) : base8<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}

  simdutf_really_inline uint32_t to_bitmask() const {
    __m256i mask = __lasx_xvmsknz_b(this->value);
    uint32_t mask0 = __lasx_xvpickve2gr_wu(mask, 0);
    uint32_t mask1 = __lasx_xvpickve2gr_wu(mask, 4);
    return (mask0 | (mask1 << 16));
  }
  simdutf_really_inline bool any() const {
    if (__lasx_xbz_b(this->value))
      return false;
    return true;
  }
  simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
};

template <typename T> struct base8_numeric : base8<T> {
  static simdutf_really_inline simd8<T> splat(T _value) {
    return __lasx_xvreplgr2vr_b(_value);
  }
  static simdutf_really_inline simd8<T> zero() { return __lasx_xvldi(0); }
  static simdutf_really_inline simd8<T> load(const T values[32]) {
    return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                                  T v5, T v6, T v7, T v8, T v9,
                                                  T v10, T v11, T v12, T v13,
                                                  T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                    v12, v13, v14, v15);
  }

  simdutf_really_inline base8_numeric() : base8<T>() {}
  simdutf_really_inline base8_numeric(const __m256i _value)
      : base8<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[32]) const {
    return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    __m256i origin = __lasx_xvand_v(this->value, __lasx_xvldi(0x1f));
    return __lasx_xvshuf_b(__lasx_xvldi(0), lookup_table, origin);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }
};

// Signed bytes
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<int8_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
  simdutf_really_inline operator simd8<uint8_t>() const;
  simdutf_really_inline bool is_ascii() const {
    __m256i ascii_mask = __lasx_xvslti_b(this->value, 0);
    if (__lasx_xbnz_v(ascii_mask))
      return false;
    return true;
  }
  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return __lasx_xvslt_b(other, this->value);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return __lasx_xvslt_b(this->value, other);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
  simdutf_really_inline simd8(const __m256i _value)
      : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20,
        uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25,
        uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30,
        uint8_t v31)
      : simd8((__m256i)v32u8{v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,
                             v8,  v9,  v10, v11, v12, v13, v14, v15,
                             v16, v17, v18, v19, v20, v21, v22, v23,
                             v24, v25, v26, v27, v28, v29, v30, v31}) {}

  // Saturated math
  simdutf_really_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return __lasx_xvssub_bu(this->value, other);
  }

  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return __lasx_xvsle_bu(other, *this);
  }
  simdutf_really_inline simd8<bool>
  operator>(const simd8<uint8_t> other) const {
    return __lasx_xvslt_bu(other, *this);
  }
  simdutf_really_inline simd8 &operator-=(const simd8<uint8_t> other) {
    value = __lasx_xvsub_b(value, other.value);
    return *this;
  }

  // Bit-specific operations
  simdutf_really_inline bool is_ascii() const {
    __m256i ascii_mask = __lasx_xvslti_b(this->value, 0);
    if (__lasx_xbnz_v(ascii_mask))
      return false;
    return true;
  }
  simdutf_really_inline bool any_bits_set_anywhere() const {
    if (__lasx_xbnz_v(this->value))
      return true;
    return false;
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return __lasx_xvsrli_b(this->value, N);
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return __lasx_xvslli_b(this->value, N);
  }

  simdutf_really_inline uint64_t sum_bytes() const {
    const auto sum_u16 = __lasx_xvhaddw_hu_bu(value, value);
    const auto sum_u32 = __lasx_xvhaddw_wu_hu(sum_u16, sum_u16);
    const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lasx_xvpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 3));
  }
};
simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
  return this->value;
}

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(NUM_CHUNKS == 2,
                "LASX kernel should use two registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r_hi = this->chunks[1].to_bitmask();
    return r_lo | (r_hi << 32);
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return this->chunks[0] | this->chunks[1];
  }

  simdutf_really_inline bool is_ascii() const {
    return this->reduce_or().is_ascii();
  }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask)
        .to_bitmask();
  }

  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>((simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
                          (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
        .to_bitmask();
  }
}; // struct simd8x64<T>

/* begin file src/simdutf/lasx/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>>
struct base16 : base<simd16<T>> {
  using bitmask_type = uint32_t;

  simdutf_really_inline base16() : base<simd16<T>>() {}
  simdutf_really_inline base16(const __m256i _value)
      : base<simd16<T>>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr)
      : base16(__lasx_xvld(reinterpret_cast<const __m256i *>(ptr), 0)) {}

  /// the size of vector in bytes
  static const int SIZE = sizeof(base<simd16<T>>::value);

  /// the number of elements of type T a vector can hold
  static const int ELEMENTS = SIZE / sizeof(T);
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return __lasx_xvreplgr2vr_h(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}
  simdutf_really_inline simd16(const __m256i _value) : base16<bool>(_value) {}
  // Splat constructor
  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}

  simdutf_really_inline bitmask_type to_bitmask() const {
    __m256i mask = __lasx_xvmsknz_b(this->value);
    bitmask_type mask0 = __lasx_xvpickve2gr_wu(mask, 0);
    bitmask_type mask1 = __lasx_xvpickve2gr_wu(mask, 4);
    return (mask0 | (mask1 << 16));
  }
  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }

  simdutf_really_inline bool is_zero() const {
    return __lasx_xbz_v(this->value);
  }

  template <unsigned N> simdutf_really_inline simd16 byte_right_shift() const {
    const auto t0 = __lasx_xvbsrl_v(this->value, N);
    const auto t1 = __lasx_xvpermi_q(this->value, __lasx_xvldi(0), 0b00000011);
    const auto t2 = __lasx_xvbsll_v(t1, 16 - N);
    const auto t3 = __lasx_xvor_v(t0, t2);
    return t3;
  }

  simdutf_really_inline uint16_t first() const {
    return uint16_t(__lasx_xvpickve2gr_w(value, 0));
  }
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return __lasx_xvreplgr2vr_h((uint16_t)_value);
  }
  static simdutf_really_inline simd16<T> zero() { return __lasx_xvldi(0); }
  template <typename Pointer>
  static simdutf_really_inline simd16<T> load(const Pointer values) {
    return __lasx_xvld(values, 0);
  }

  simdutf_really_inline base16_numeric() : base16<T>() {}
  simdutf_really_inline base16_numeric(const __m256i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
};

// Unsigned code units
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
  simdutf_really_inline simd16(const __m256i _value)
      : base16_numeric<uint16_t>(_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Order-specific operations
  simdutf_really_inline simd16 &operator+=(const simd16 other) {
    value = __lasx_xvadd_h(value, other.value);
    return *this;
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return __lasx_xvshuf4i_b(this->value, 0b10110001);
  }

  template <unsigned N>
  static simdutf_really_inline simd8<uint8_t>
  pack_shifted_right(const simd16<uint16_t> &v0, const simd16<uint16_t> &v1) {
    return __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(v1.value, v0.value, N),
                            0b11011000);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {

    return pack_shifted_right<0>(v0, v1);
  }

  simdutf_really_inline uint64_t sum() const {
    const auto sum_u32 = __lasx_xvhaddw_wu_hu(value, value);
    const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lasx_xvpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(sum_u64, 3));
  }

  template <unsigned N> simdutf_really_inline simd16 byte_right_shift() const {
    return __lasx_xvbsrl_v(this->value, N);
  }
};

simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> a,
                                             const simd16<uint16_t> b) {
  return __lasx_xvslt_hu(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> a,
                                             const simd16<uint16_t> b) {
  return __lasx_xvslt_hu(b.value, a.value);
}

simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> a,
                                              const simd16<uint16_t> b) {
  return __lasx_xvsle_hu(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> a,
                                              const simd16<uint16_t> b) {
  return __lasx_xvsle_hu(b.value, a.value);
}

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(NUM_CHUNKS == 2,
                "LASX kernel should use two registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline simd16x32(const simd16<T> chunk0,
                                  const simd16<T> chunk1)
      : chunks{chunk0, chunk1} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
  }
  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r_hi = this->chunks[1].to_bitmask();
    return r_lo | (r_hi << 32);
  }
  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
        .to_bitmask();
  }
}; // struct simd16x32<T>

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> a,
                                           const simd16<uint16_t> b) {
  return __lasx_xvmin_hu(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator==(const simd16<uint16_t> a,
                                              uint16_t b) {
  const auto bv = __lasx_xvreplgr2vr_h(b);
  return __lasx_xvseq_h(a.value, bv);
}

simdutf_really_inline simd16<uint16_t> as_vector_u16(const simd16<bool> x) {
  return x.value;
}

simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> a,
                                                 uint16_t b) {
  const auto bv = __lasx_xvreplgr2vr_h(b);
  return __lasx_xvand_v(a.value, bv);
}

simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> a,
                                                 const simd16<uint16_t> b) {
  return __lasx_xvand_v(a.value, b.value);
}

simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
                                                 uint16_t b) {
  const auto bv = __lasx_xvreplgr2vr_h(b);
  return __lasx_xvxor_v(a.value, bv);
}

simdutf_really_inline simd16<bool> operator^(const simd16<bool> a,
                                             const simd16<bool> b) {
  return __lasx_xvxor_v(a.value, b.value);
}
/* end file src/simdutf/lasx/simd16-inl.h */
/* begin file src/simdutf/lasx/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  __m256i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint32_t);

  // constructors
  simdutf_really_inline simd32(__m256i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd32(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd32 &operator-=(const simd32 other) {
    value = __lasx_xvsub_w(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    const auto odd = __lasx_xvsrli_d(value, 32);
    const auto even = __lasx_xvand_v(value, __lasx_xvreplgr2vr_d(0xffffffff));

    const auto sum64 = __lasx_xvadd_d(odd, even);

    return uint64_t(__lasx_xvpickve2gr_du(sum64, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(sum64, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(sum64, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(sum64, 3));
  }

  // static members
  static simdutf_really_inline simd32<uint32_t> splat(uint32_t x) {
    return __lasx_xvreplgr2vr_w(x);
  }

  static simdutf_really_inline simd32<uint32_t> zero() {
    return __lasx_xvrepli_w(0);
  }
};

// ------------------------------------------------------------

template <> struct simd32<bool> {
  __m256i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd32(__m256i v) : value(v) {}
};

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return __lasx_xvor_v(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator<(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lasx_xvslt_wu(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lasx_xvslt_wu(b.value, a.value);
}

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> as_vector_u32(const simd32<bool> v) {
  return v.value;
}
/* end file src/simdutf/lasx/simd32-inl.h */
/* begin file src/simdutf/lasx/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  __m256i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint64_t);

  // constructors
  simdutf_really_inline simd64(__m256i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd64(Ptr *ptr) : value(__lasx_xvld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = __lasx_xvadd_d(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    return uint64_t(__lasx_xvpickve2gr_du(value, 0)) +
           uint64_t(__lasx_xvpickve2gr_du(value, 1)) +
           uint64_t(__lasx_xvpickve2gr_du(value, 2)) +
           uint64_t(__lasx_xvpickve2gr_du(value, 3));
  }

  // static members
  static simdutf_really_inline simd64<uint64_t> zero() {
    return __lasx_xvrepli_d(0);
  }
};

// ------------------------------------------------------------

template <> struct simd64<bool> {
  __m256i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd64(__m256i v) : value(v) {}
};

// ------------------------------------------------------------

simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  const auto sum_u16 = __lasx_xvhaddw_hu_bu(v, v);
  const auto sum_u32 = __lasx_xvhaddw_wu_hu(sum_u16, sum_u16);
  const auto sum_u64 = __lasx_xvhaddw_du_wu(sum_u32, sum_u32);

  return simd64<uint64_t>(sum_u64);
}
/* end file src/simdutf/lasx/simd64-inl.h */

} // namespace simd
} // unnamed namespace
} // namespace lasx
} // namespace simdutf

#endif // SIMDUTF_LASX_SIMD_H
/* end file src/simdutf/lasx/simd.h */

/* begin file src/simdutf/lasx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP

#if SIMDUTF_CAN_ALWAYS_RUN_LASX
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif
/* end file src/simdutf/lasx/end.h */

#endif // SIMDUTF_IMPLEMENTATION_LASX

#endif // SIMDUTF_LASX_H
/* end file src/simdutf/lasx.h */
/* begin file src/simdutf/lsx.h */
#ifndef SIMDUTF_LSX_H
#define SIMDUTF_LSX_H

#ifdef SIMDUTF_FALLBACK_H
  #error "lsx.h must be included before fallback.h"
#endif

#ifndef SIMDUTF_CAN_ALWAYS_RUN_LASX
  #error "lsx.h must be included after lasx.h"
#endif


#ifndef SIMDUTF_IMPLEMENTATION_LSX
  #if SIMDUTF_CAN_ALWAYS_RUN_LASX
    #define SIMDUTF_IMPLEMENTATION_LSX 0
  #else
    #define SIMDUTF_IMPLEMENTATION_LSX (SIMDUTF_IS_LSX)
  #endif
#endif
#if SIMDUTF_IMPLEMENTATION_LSX && SIMDUTF_IS_LSX
  #define SIMDUTF_CAN_ALWAYS_RUN_LSX 1
#else
  #define SIMDUTF_CAN_ALWAYS_RUN_LSX 0
#endif

#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)

#if SIMDUTF_IMPLEMENTATION_LSX

namespace simdutf {
/**
 * Implementation for LoongArch SX.
 */
namespace lsx {} // namespace lsx
} // namespace simdutf

/* begin file src/simdutf/lsx/implementation.h */
#ifndef SIMDUTF_LSX_IMPLEMENTATION_H
#define SIMDUTF_LSX_IMPLEMENTATION_H


namespace simdutf {
namespace lsx {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("lsx", "LOONGARCH SX",
                                internal::instruction_set::LSX) {}
  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;
  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;
  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char *input, size_t length) const noexcept override;
  simdutf_warn_unused size_t binary_length_from_base64(
      const char16_t *input, size_t length) const noexcept override;
};

} // namespace lsx
} // namespace simdutf

#endif // SIMDUTF_LSX_IMPLEMENTATION_H
/* end file src/simdutf/lsx/implementation.h */

/* begin file src/simdutf/lsx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lsx"
// #define SIMDUTF_IMPLEMENTATION lsx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1
/* end file src/simdutf/lsx/begin.h */

  // Declarations
/* begin file src/simdutf/lsx/intrinsics.h */
#ifndef SIMDUTF_LSX_INTRINSICS_H
#define SIMDUTF_LSX_INTRINSICS_H


// This should be the correct header whether
// you use visual studio or other compilers.
#include <lsxintrin.h>

/*
Encoding of argument for LoongArch64 xvldi instruction.  See:
https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/#__m256i-__lasx_xvldi-imm_n1024_1023-imm

1: imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes

2: imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes

3: imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes

4: imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes

5: imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes

6: imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes

7: imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all
lanes

8: imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to
all lanes

9: imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes

10: imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast
the result as 64-bit elements to all lanes
*/

namespace vldi {

template <uint16_t v> class const_u16 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);

  constexpr static bool is_case5 = uint16_t(b0) == v;
  constexpr static bool is_case6 = (uint16_t(b1) << 8) == v;
  constexpr static bool is_case9 = (b0 == b1);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00));

public:
  constexpr static uint16_t operation = is_case5    ? 0b10100
                                        : is_case6  ? 0b10101
                                        : is_case9  ? 0b11000
                                        : is_case10 ? 0x11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case5    ? b0
      : is_case6  ? b1
      : is_case9  ? b0
      : is_case10 ? ((b0 ? 0x55 : 0x00) | (b1 ? 0xaa : 0x00))
                  : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint32_t v> class const_u32 {
  constexpr static const uint8_t b0 = (v & 0xff);
  constexpr static const uint8_t b1 = ((v >> 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 16) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 24) & 0xff);

  constexpr static bool is_case1 = (uint32_t(b0) == v);
  constexpr static bool is_case2 = ((uint32_t(b1) << 8) == v);
  constexpr static bool is_case3 = ((uint32_t(b2) << 16) == v);
  constexpr static bool is_case4 = ((uint32_t(b3) << 24) == v);
  constexpr static bool is_case5 = (b0 == b2) && (b1 == 0) && (b3 == 0);
  constexpr static bool is_case6 = (b1 == b3) && (b0 == 0) && (b2 == 0);
  constexpr static bool is_case7 = (b3 == 0) && (b2 == 0) && (b0 == 0xff);
  constexpr static bool is_case8 = (b3 == 0) && (b1 == 0xff) && (b0 == 0xff);
  constexpr static bool is_case9 = (b0 == b1) && (b0 == b2) && (b0 == b3);
  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00));

public:
  constexpr static uint16_t operation = is_case1    ? 0b10000
                                        : is_case2  ? 0b10001
                                        : is_case3  ? 0b10010
                                        : is_case4  ? 0b10011
                                        : is_case5  ? 0b10100
                                        : is_case6  ? 0b10101
                                        : is_case7  ? 0b10110
                                        : is_case8  ? 0b10111
                                        : is_case9  ? 0b11000
                                        : is_case10 ? 0b11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_case1    ? b0
      : is_case2  ? b1
      : is_case3  ? b2
      : is_case4  ? b3
      : is_case5  ? b0
      : is_case6  ? b1
      : is_case7  ? b1
      : is_case8  ? b2
      : is_case9  ? b0
      : is_case10 ? ((b0 ? 0x11 : 0x00) | (b1 ? 0x22 : 0x00) |
                     (b2 ? 0x44 : 0x00) | (b3 ? 0x88 : 0x00))
                  : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};

template <uint64_t v> class const_u64 {
  constexpr static const uint8_t b0 = ((v >> 0 * 8) & 0xff);
  constexpr static const uint8_t b1 = ((v >> 1 * 8) & 0xff);
  constexpr static const uint8_t b2 = ((v >> 2 * 8) & 0xff);
  constexpr static const uint8_t b3 = ((v >> 3 * 8) & 0xff);
  constexpr static const uint8_t b4 = ((v >> 4 * 8) & 0xff);
  constexpr static const uint8_t b5 = ((v >> 5 * 8) & 0xff);
  constexpr static const uint8_t b6 = ((v >> 6 * 8) & 0xff);
  constexpr static const uint8_t b7 = ((v >> 7 * 8) & 0xff);

  constexpr static bool is_case10 =
      ((b0 == 0xff) || (b0 == 0x00)) && ((b1 == 0xff) || (b1 == 0x00)) &&
      ((b2 == 0xff) || (b2 == 0x00)) && ((b3 == 0xff) || (b3 == 0x00)) &&
      ((b4 == 0xff) || (b4 == 0x00)) && ((b5 == 0xff) || (b5 == 0x00)) &&
      ((b6 == 0xff) || (b6 == 0x00)) && ((b7 == 0xff) || (b7 == 0x00));

public:
  constexpr static bool is_32bit =
      ((v & 0xffffffff) == (v >> 32)) && const_u32<(v >> 32)>::value;
  constexpr static uint8_t op_32bit = const_u32<(v >> 32)>::operation;
  constexpr static uint8_t byte_32bit = const_u32<(v >> 32)>::byte;

  constexpr static uint16_t operation = is_32bit    ? op_32bit
                                        : is_case10 ? 0x11001
                                                    : 0xffff;

  constexpr static uint16_t byte =
      is_32bit ? byte_32bit
      : is_case10
          ? ((b0 ? 0x01 : 0x00) | (b1 ? 0x02 : 0x00) | (b2 ? 0x04 : 0x00) |
             (b3 ? 0x08 : 0x00) | (b4 ? 0x10 : 0x00) | (b5 ? 0x20 : 0x00) |
             (b6 ? 0x40 : 0x00) | (b7 ? 0x80 : 0x00))
          : 0xffff;

  constexpr static int value = int((operation << 8) | byte) - 8192;
  constexpr static bool valid = operation != 0xffff;
};
} // namespace vldi

// Uncomment when running under QEMU affected
// by bug https://gitlab.com/qemu-project/qemu/-/issues/2865
// Versions <= 9.2.2 are affected, likely anything newer is correct.
#ifndef QEMU_VLDI_BUG
// #define QEMU_VLDI_BUG 1
#endif

#ifndef lsx_splat_u16
  #ifdef QEMU_VLDI_BUG
    #define lsx_splat_u16(v) __lsx_vreplgr2vr_h(v)
    #define lsx_splat_u32(v) __lsx_vreplgr2vr_w(v)
  #else
namespace {
template <uint16_t x> constexpr __m128i lsx_splat_u16_aux() {
  return ((int16_t(x) < 512) && (int16_t(x) > -512))
             ? __lsx_vrepli_h(
                   ((int16_t(x) < 512) && (int16_t(x) > -512)) ? int16_t(x) : 0)
             : (vldi::const_u16<x>::valid
                    ? __lsx_vldi(vldi::const_u16<x>::valid
                                     ? vldi::const_u16<x>::value
                                     : 0)
                    : __lsx_vreplgr2vr_h(x));
}

template <uint32_t x> constexpr __m128i lsx_splat_u32_aux() {
  return ((int32_t(x) < 512) && (int32_t(x) > -512))
             ? __lsx_vrepli_w(
                   ((int32_t(x) < 512) && (int32_t(x) > -512)) ? int32_t(x) : 0)
             : (vldi::const_u32<x>::valid
                    ? __lsx_vldi(vldi::const_u32<x>::valid
                                     ? vldi::const_u32<x>::value
                                     : 0)
                    : __lsx_vreplgr2vr_w(x));
}
} // namespace
    #define lsx_splat_u16(v) lsx_splat_u16_aux<(v)>()
    #define lsx_splat_u32(v) lsx_splat_u32_aux<(v)>()
  #endif // QEMU_VLDI_BUG
#endif   // lsx_splat_u16
#endif   //  SIMDUTF_LSX_INTRINSICS_H
/* end file src/simdutf/lsx/intrinsics.h */
/* begin file src/simdutf/lsx/bitmanipulation.h */
#ifndef SIMDUTF_LSX_BITMANIPULATION_H
#define SIMDUTF_LSX_BITMANIPULATION_H

#include <limits>

namespace simdutf {
namespace lsx {
namespace {

simdutf_really_inline int count_ones(uint64_t input_num) {
  return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0);
}

#if SIMDUTF_NEED_TRAILING_ZEROES
simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
  return __builtin_ctzll(input_num);
}
#endif

} // unnamed namespace
} // namespace lsx
} // namespace simdutf

#endif // SIMDUTF_LSX_BITMANIPULATION_H
/* end file src/simdutf/lsx/bitmanipulation.h */
/* begin file src/simdutf/lsx/simd.h */
#ifndef SIMDUTF_LSX_SIMD_H
#define SIMDUTF_LSX_SIMD_H


namespace simdutf {
namespace lsx {
namespace {
namespace simd {

template <typename T> struct simd8;

//
// Base class of simd8<uint8_t> and simd8<bool>, both of which use __m128i
// internally.
//
template <typename T, typename Mask = simd8<bool>> struct base_u8 {
  __m128i value;
  static const int SIZE = sizeof(value);

  // Conversion from/to SIMD register
  simdutf_really_inline base_u8(const __m128i _value) : value(_value) {}
  simdutf_really_inline operator const __m128i &() const { return this->value; }
  simdutf_really_inline operator __m128i &() { return this->value; }

  // Bit operations
  simdutf_really_inline simd8<T> operator|(const simd8<T> other) const {
    return __lsx_vor_v(this->value, other);
  }
  simdutf_really_inline simd8<T> operator&(const simd8<T> other) const {
    return __lsx_vand_v(this->value, other);
  }
  simdutf_really_inline simd8<T> operator^(const simd8<T> other) const {
    return __lsx_vxor_v(this->value, other);
  }
  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
  simdutf_really_inline simd8<T> &operator|=(const simd8<T> other) {
    auto this_cast = static_cast<simd8<T> *>(this);
    *this_cast = *this_cast | other;
    return *this_cast;
  }

  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
                                               const simd8<T> rhs) {
    return __lsx_vseq_b(lhs, rhs);
  }

  template <int N = 1>
  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
    return __lsx_vor_v(__lsx_vbsll_v(this->value, N),
                       __lsx_vbsrl_v(prev_chunk.value, 16 - N));
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd8<bool> : base_u8<bool> {
  typedef uint16_t bitmask_t;
  typedef uint32_t bitmask2_t;

  static simdutf_really_inline simd8<bool> splat(bool _value) {
    return __lsx_vreplgr2vr_b(uint8_t(-(!!_value)));
  }

  simdutf_really_inline simd8(const __m128i _value) : base_u8<bool>(_value) {}
  // False constructor
  simdutf_really_inline simd8() : simd8(__lsx_vldi(0)) {}
  // Splat constructor
  simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return __lsx_vst(this->value, dst, 0);
  }

  simdutf_really_inline uint32_t to_bitmask() const {
    return __lsx_vpickve2gr_wu(__lsx_vmsknz_b(*this), 0);
  }
};

// Unsigned bytes
template <> struct simd8<uint8_t> : base_u8<uint8_t> {
  static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) {
    return __lsx_vreplgr2vr_b(_value);
  }
  static simdutf_really_inline simd8<uint8_t> zero() { return __lsx_vldi(0); }
  static simdutf_really_inline simd8<uint8_t> load(const uint8_t *values) {
    return __lsx_vld(values, 0);
  }
  simdutf_really_inline simd8(const __m128i _value)
      : base_u8<uint8_t>(_value) {}
  // Zero constructor
  simdutf_really_inline simd8() : simd8(zero()) {}
  // Array constructor
  simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
  // Splat constructor
  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Member-by-member initialization
  simdutf_really_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8((__m128i)v16u8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
                             v12, v13, v14, v15}) {}

  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdutf_really_inline static simd8<uint8_t>
  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
            uint8_t v15) {
    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                          v13, v14, v15);
  }

  // Store to array
  simdutf_really_inline void store(uint8_t dst[16]) const {
    return __lsx_vst(this->value, dst, 0);
  }

  // Order-specific operations
  simdutf_really_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return __lsx_vsle_bu(other, *this);
  }
  simdutf_really_inline simd8<bool>
  operator>(const simd8<uint8_t> other) const {
    return __lsx_vslt_bu(other, *this);
  }
  simdutf_really_inline simd8 &operator-=(const simd8<uint8_t> other) {
    value = __lsx_vsub_b(value, other.value);
    return *this;
  }
  // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true
  // = nonzero. For ARM, returns all 1's.
  simdutf_really_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return simd8<uint8_t>(*this > other);
  }

  // Bit-specific operations
  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
    return __lsx_vslt_bu(__lsx_vldi(0), __lsx_vand_v(this->value, bits));
  }
  simdutf_really_inline bool is_ascii() const {
    return __lsx_vpickve2gr_hu(__lsx_vmskgez_b(this->value), 0) == 0xFFFF;
  }

  simdutf_really_inline bool any_bits_set_anywhere() const {
    return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(this->value), 0) > 0;
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
    return __lsx_vsrli_b(this->value, N);
  }
  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
    return __lsx_vslli_b(this->value, N);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values)
  template <typename L>
  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    return lookup_table.apply_lookup_16_to(*this);
  }

  template <typename L>
  simdutf_really_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15));
  }

  template <typename T>
  simdutf_really_inline simd8<uint8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f));
    return __lsx_vshuf_b(__lsx_vldi(0), *this, simd8<uint8_t>(original_tmp));
  }

  simdutf_really_inline uint64_t sum_bytes() const {
    const auto sum_u16 = __lsx_vhaddw_hu_bu(value, value);
    const auto sum_u32 = __lsx_vhaddw_wu_hu(sum_u16, sum_u16);
    const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lsx_vpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lsx_vpickve2gr_du(sum_u64, 1));
  }
};

// Signed bytes
template <> struct simd8<int8_t> {
  __m128i value;

  static const int SIZE = sizeof(value);

  static simdutf_really_inline simd8<int8_t> splat(int8_t _value) {
    return __lsx_vreplgr2vr_b(_value);
  }
  static simdutf_really_inline simd8<int8_t> zero() { return __lsx_vldi(0); }
  static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) {
    return __lsx_vld(values, 0);
  }

  template <endianness big_endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const {
    __m128i zero = __lsx_vldi(0);
    if constexpr (match_system(big_endian)) {
      __lsx_vst(__lsx_vilvl_b(zero, (__m128i)this->value),
                reinterpret_cast<uint16_t *>(p), 0);
      __lsx_vst(__lsx_vilvh_b(zero, (__m128i)this->value),
                reinterpret_cast<uint16_t *>(p + 8), 0);
    } else {
      __lsx_vst(__lsx_vilvl_b((__m128i)this->value, zero),
                reinterpret_cast<uint16_t *>(p), 0);
      __lsx_vst(__lsx_vilvh_b((__m128i)this->value, zero),
                reinterpret_cast<uint16_t *>(p + 8), 0);
    }
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const {
    __m128i zero = __lsx_vldi(0);
    __m128i in16low = __lsx_vilvl_b(zero, (__m128i)this->value);
    __m128i in16high = __lsx_vilvh_b(zero, (__m128i)this->value);
    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);
    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(p), 0);
    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(p + 4), 0);
    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(p + 8), 0);
    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(p + 12), 0);
  }

  // In places where the table can be reused, which is most uses in simdutf, it
  // is worth it to do 4 table lookups, as there is no direct zero extension
  // from u8 to u32.
  simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const {
    const simd8<uint8_t> tb1{0, 255, 255, 255, 1, 255, 255, 255,
                             2, 255, 255, 255, 3, 255, 255, 255};
    const simd8<uint8_t> tb2{4, 255, 255, 255, 5, 255, 255, 255,
                             6, 255, 255, 255, 7, 255, 255, 255};
    const simd8<uint8_t> tb3{8,  255, 255, 255, 9,  255, 255, 255,
                             10, 255, 255, 255, 11, 255, 255, 255};
    const simd8<uint8_t> tb4{12, 255, 255, 255, 13, 255, 255, 255,
                             14, 255, 255, 255, 15, 255, 255, 255};

    // encourage store pairing and interleaving
    const auto shuf1 = this->apply_lookup_16_to(tb1);
    const auto shuf2 = this->apply_lookup_16_to(tb2);
    shuf1.store(reinterpret_cast<int8_t *>(p));
    shuf2.store(reinterpret_cast<int8_t *>(p + 4));

    const auto shuf3 = this->apply_lookup_16_to(tb3);
    const auto shuf4 = this->apply_lookup_16_to(tb4);
    shuf3.store(reinterpret_cast<int8_t *>(p + 8));
    shuf4.store(reinterpret_cast<int8_t *>(p + 12));
  }
  // Conversion from/to SIMD register
  simdutf_really_inline simd8(const __m128i _value) : value(_value) {}

  // Zero constructor
  simdutf_really_inline simd8() : simd8(zero()) {}
  // Splat constructor
  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}

  // Store to array
  simdutf_really_inline void store(int8_t dst[16]) const {
    return __lsx_vst(value, dst, 0);
  }

  simdutf_really_inline operator simd8<uint8_t>() const {
    return ((__m128i)this->value);
  }

  simdutf_really_inline simd8<int8_t>
  operator|(const simd8<int8_t> other) const {
    return __lsx_vor_v((__m128i)value, (__m128i)other.value);
  }

  simdutf_really_inline bool is_ascii() const {
    return (__lsx_vpickve2gr_hu(__lsx_vmskgez_b((__m128i)this->value), 0) ==
            0xffff);
  }

  // Order-sensitive comparisons
  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
    return __lsx_vslt_b((__m128i)other.value, (__m128i)value);
  }
  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
    return __lsx_vslt_b((__m128i)value, (__m128i)other.value);
  }

  template <int N = 1>
  simdutf_really_inline simd8<int8_t>
  prev(const simd8<int8_t> prev_chunk) const {
    return __lsx_vor_v(__lsx_vbsll_v(this->value, N),
                       __lsx_vbsrl_v(prev_chunk.value, 16 - N));
  }

  template <typename T>
  simdutf_really_inline simd8<int8_t>
  apply_lookup_16_to(const simd8<T> original) const {
    __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f));
    return __lsx_vshuf_b(__lsx_vldi(0), (__m128i)this->value,
                         simd8<uint8_t>(original_tmp));
  }
};

template <typename T> struct simd8x64 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
  static_assert(
      NUM_CHUNKS == 4,
      "LoongArch kernel should use four registers per 64-byte block.");
  simd8<T> chunks[NUM_CHUNKS];

  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
  simd8x64<T> &
  operator=(const simd8<T> other) = delete; // no assignment allowed
  simd8x64() = delete;                      // no default constructor allowed

  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
                                 const simd8<T> chunk2, const simd8<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd8x64(const T *ptr)
      : chunks{simd8<T>::load(ptr),
               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
    this->chunks[0] |= other.chunks[0];
    this->chunks[1] |= other.chunks[1];
    this->chunks[2] |= other.chunks[2];
    this->chunks[3] |= other.chunks[3];
    return *this;
  }

  simdutf_really_inline simd8<T> reduce_or() const {
    return (this->chunks[0] | this->chunks[1]) |
           (this->chunks[2] | this->chunks[3]);
  }

  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }

  template <endianness endian>
  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 0);
    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 1);
    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 2);
    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
                                                          sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
    this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 0);
    this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 1);
    this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 2);
    this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 3);
  }

  simdutf_really_inline uint64_t to_bitmask() const {
    __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[3]), 6);
    mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[2]), 4));
    mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[1]), 2));
    mask = __lsx_vor_v(mask, __lsx_vmsknz_b(this->chunks[0]));
    return __lsx_vpickve2gr_du(mask, 0);
  }

  simdutf_really_inline uint64_t lt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
                          this->chunks[2] < mask, this->chunks[3] < mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gt(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
                          this->chunks[2] > mask, this->chunks[3] > mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd8<T> mask = simd8<T>::splat(m);
    return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
                          this->chunks[2] >= mask, this->chunks[3] >= mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
    return simd8x64<bool>(simd8<uint8_t>(this->chunks[0].value) >= mask,
                          simd8<uint8_t>(this->chunks[1].value) >= mask,
                          simd8<uint8_t>(this->chunks[2].value) >= mask,
                          simd8<uint8_t>(this->chunks[3].value) >= mask)
        .to_bitmask();
  }
}; // struct simd8x64<T>

/* begin file src/simdutf/lsx/simd16-inl.h */
template <typename T> struct simd16;

template <typename T, typename Mask = simd16<bool>> struct base_u16 {
  __m128i value;
  static const size_t SIZE = sizeof(value);
  static const size_t ELEMENTS = sizeof(value) / sizeof(T);

  // Conversion from/to SIMD register
  simdutf_really_inline base_u16() = default;
  simdutf_really_inline base_u16(const __m128i _value) : value(_value) {}
  // Bit operations
  simdutf_really_inline simd16<T> operator|(const simd16<T> other) const {
    return __lsx_vor_v(this->value, other.value);
  }
  simdutf_really_inline simd16<T> operator&(const simd16<T> other) const {
    return __lsx_vand_v(this->value, other.value);
  }
  simdutf_really_inline simd16<T> operator~() const {
    return __lsx_vxori_b(this->value, 0xFF);
  }

  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
                                               const simd16<T> rhs) {
    return __lsx_vseq_h(lhs.value, rhs.value);
  }

  template <unsigned N>
  simdutf_really_inline simd16<T> byte_right_shift() const {
    return __lsx_vbsrl_v(this->value, N);
  }

  simdutf_really_inline uint16_t first() const {
    return uint16_t(__lsx_vpickve2gr_w(value, 0));
  }
};

template <typename T, typename Mask = simd16<bool>>
struct base16 : base_u16<T> {
  using bitmask_type = uint16_t;

  simdutf_really_inline base16() : base_u16<T>() {}
  simdutf_really_inline base16(const __m128i _value) : base_u16<T>(_value) {}
  template <typename Pointer>
  simdutf_really_inline base16(const Pointer *ptr)
      : base16(__lsx_vld(ptr, 0)) {}

  static const int SIZE = sizeof(base_u16<T>::value);

  template <unsigned N = 1>
  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
    return __lsx_vor_v(__lsx_vbsll_v(*this, N * 2),
                       __lsx_vbsrl_v(prev_chunk, 16 - N * 2));
  }
};

// SIMD byte mask type (returned by things like eq and gt)
template <> struct simd16<bool> : base16<bool> {
  static simdutf_really_inline simd16<bool> splat(bool _value) {
    return __lsx_vreplgr2vr_h(uint16_t(-(!!_value)));
  }

  simdutf_really_inline simd16() : base16() {}
  simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}

  simdutf_really_inline bitmask_type to_bitmask() const {
    __m128i mask = __lsx_vmsknz_b(this->value);
    bitmask_type mask0 = bitmask_type(__lsx_vpickve2gr_wu(mask, 0));
    return mask0;
  }

  simdutf_really_inline bool is_zero() const { return __lsx_bz_v(this->value); }
};

template <typename T> struct base16_numeric : base16<T> {
  static simdutf_really_inline simd16<T> splat(T _value) {
    return __lsx_vreplgr2vr_h(_value);
  }
  static simdutf_really_inline simd16<T> zero() { return __lsx_vldi(0); }

  template <typename Pointer>
  static simdutf_really_inline simd16<T> load(const Pointer values) {
    return __lsx_vld(values, 0);
  }

  simdutf_really_inline base16_numeric(const __m128i _value)
      : base16<T>(_value) {}

  // Store to array
  simdutf_really_inline void store(T dst[8]) const {
    return __lsx_vst(this->value, dst, 0);
  }

  // Override to distinguish from bool version
  simdutf_really_inline simd16<T> operator~() const {
    return __lsx_vxori_b(this->value, 0xFF);
  }
};

// Unsigned code unitstemplate<>
template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
  simdutf_really_inline simd16(const __m128i _value)
      : base16_numeric<uint16_t>((__m128i)_value) {}

  // Splat constructor
  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}

  // Array constructor
  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
  simdutf_really_inline simd16(const char16_t *values)
      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}

  // Copy constructor
  simdutf_really_inline simd16(const simd16<bool> mask) : simd16(mask.value) {}

  // Order-specific operations
  simdutf_really_inline simd16 &operator+=(const simd16 other) {
    value = __lsx_vadd_h(value, other.value);
    return *this;
  }

  template <unsigned N>
  static simdutf_really_inline simd8<uint8_t>
  pack_shifted_right(const simd16<uint16_t> &v0, const simd16<uint16_t> &v1) {
    return __lsx_vssrlni_bu_h(v1.value, v0.value, N);
  }

  // Pack with the unsigned saturation of two uint16_t code units into single
  // uint8_t vector
  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
                                                   const simd16<uint16_t> &v1) {
    return pack_shifted_right<0>(v0, v1);
  }

  // Change the endianness
  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
    return __lsx_vshuf4i_b(this->value, 0b10110001);
  }

  simdutf_really_inline uint64_t sum() const {
    const auto sum_u32 = __lsx_vhaddw_wu_hu(value, value);
    const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32);

    return uint64_t(__lsx_vpickve2gr_du(sum_u64, 0)) +
           uint64_t(__lsx_vpickve2gr_du(sum_u64, 1));
  }
};

simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> a,
                                             const simd16<uint16_t> b) {
  return __lsx_vslt_hu(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> a,
                                             const simd16<uint16_t> b) {
  return __lsx_vslt_hu(b.value, a.value);
}

simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> a,
                                              const simd16<uint16_t> b) {
  return __lsx_vsle_hu(a.value, b.value);
}

simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> a,
                                              const simd16<uint16_t> b) {
  return __lsx_vsle_hu(b.value, a.value);
}

template <typename T> struct simd16x32 {
  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
  static_assert(
      NUM_CHUNKS == 4,
      "LOONGARCH kernel should use four registers per 64-byte block.");
  simd16<T> chunks[NUM_CHUNKS];

  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
  simd16x32<T> &
  operator=(const simd16<T> other) = delete; // no assignment allowed
  simd16x32() = delete;                      // no default constructor allowed

  simdutf_really_inline
  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
            const simd16<T> chunk2, const simd16<T> chunk3)
      : chunks{chunk0, chunk1, chunk2, chunk3} {}
  simdutf_really_inline simd16x32(const T *ptr)
      : chunks{simd16<T>::load(ptr),
               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}

  simdutf_really_inline void store(T *ptr) const {
    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
  }

  simdutf_really_inline void swap_bytes() {
    this->chunks[0] = this->chunks[0].swap_bytes();
    this->chunks[1] = this->chunks[1].swap_bytes();
    this->chunks[2] = this->chunks[2].swap_bytes();
    this->chunks[3] = this->chunks[3].swap_bytes();
  }
  simdutf_really_inline uint64_t to_bitmask() const {
    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
    uint64_t r1 = this->chunks[1].to_bitmask();
    uint64_t r2 = this->chunks[2].to_bitmask();
    uint64_t r3 = this->chunks[3].to_bitmask();
    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
  }
  simdutf_really_inline uint64_t gteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask,
                           this->chunks[2] >= mask, this->chunks[3] >= mask)
        .to_bitmask();
  }
  simdutf_really_inline uint64_t lteq(const T m) const {
    const simd16<T> mask = simd16<T>::splat(m);
    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
                           this->chunks[2] <= mask, this->chunks[3] <= mask)
        .to_bitmask();
  }
}; // struct simd16x32<T>

simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> a,
                                                 uint16_t b) {
  const auto bv = __lsx_vreplgr2vr_h(b);
  return __lsx_vxor_v(a.value, bv);
}

simdutf_really_inline simd16<bool> operator^(const simd16<bool> a,
                                             const simd16<bool> b) {
  return __lsx_vxor_v(a.value, b.value);
}

simdutf_really_inline simd16<uint16_t> min(const simd16<uint16_t> a,
                                           const simd16<uint16_t> b) {
  return __lsx_vmin_hu(a.value, b.value);
}

simdutf_really_inline simd16<uint16_t> as_vector_u16(const simd16<bool> x) {
  return x.value;
}
/* end file src/simdutf/lsx/simd16-inl.h */
/* begin file src/simdutf/lsx/simd32-inl.h */
template <typename T> struct simd32;

template <> struct simd32<uint32_t> {
  __m128i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint32_t);

  // constructors
  simdutf_really_inline simd32(__m128i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd32(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd32 &operator-=(const simd32 other) {
    value = __lsx_vsub_w(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    return uint64_t(__lsx_vpickve2gr_wu(value, 0)) +
           uint64_t(__lsx_vpickve2gr_wu(value, 1)) +
           uint64_t(__lsx_vpickve2gr_wu(value, 2)) +
           uint64_t(__lsx_vpickve2gr_wu(value, 3));
  }

  // static members
  static simdutf_really_inline simd32<uint32_t> splat(uint32_t x) {
    return __lsx_vreplgr2vr_w(x);
  }

  static simdutf_really_inline simd32<uint32_t> zero() {
    return __lsx_vrepli_w(0);
  }
};

// ------------------------------------------------------------

template <> struct simd32<bool> {
  __m128i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd32(__m128i v) : value(v) {}
};

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> operator&(const simd32<uint32_t> a,
                                                 const simd32<uint32_t> b) {
  return __lsx_vor_v(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator<(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lsx_vslt_wu(a.value, b.value);
}

simdutf_really_inline simd32<bool> operator>(const simd32<uint32_t> a,
                                             const simd32<uint32_t> b) {
  return __lsx_vslt_wu(b.value, a.value);
}

// ------------------------------------------------------------

simdutf_really_inline simd32<uint32_t> as_vector_u32(const simd32<bool> v) {
  return v.value;
}
/* end file src/simdutf/lsx/simd32-inl.h */
/* begin file src/simdutf/lsx/simd64-inl.h */
template <typename T> struct simd64;

template <> struct simd64<uint64_t> {
  __m128i value;
  static const int SIZE = sizeof(value);
  static const int ELEMENTS = SIZE / sizeof(uint64_t);

  // constructors
  simdutf_really_inline simd64(__m128i v) : value(v) {}

  template <typename Ptr>
  simdutf_really_inline simd64(Ptr *ptr) : value(__lsx_vld(ptr, 0)) {}

  // in-place operators
  simdutf_really_inline simd64 &operator+=(const simd64 other) {
    value = __lsx_vadd_d(value, other.value);
    return *this;
  }

  // members
  simdutf_really_inline uint64_t sum() const {
    return uint64_t(__lsx_vpickve2gr_du(value, 0)) +
           uint64_t(__lsx_vpickve2gr_du(value, 1));
  }

  // static members
  static simdutf_really_inline simd64<uint64_t> zero() {
    return __lsx_vrepli_d(0);
  }
};

// ------------------------------------------------------------

template <> struct simd64<bool> {
  __m128i value;
  static const int SIZE = sizeof(value);

  // constructors
  simdutf_really_inline simd64(__m128i v) : value(v) {}
};

// ------------------------------------------------------------

simd64<uint64_t> sum_8bytes(const simd8<uint8_t> v) {
  const auto sum_u16 = __lsx_vhaddw_hu_bu(v, v);
  const auto sum_u32 = __lsx_vhaddw_wu_hu(sum_u16, sum_u16);
  const auto sum_u64 = __lsx_vhaddw_du_wu(sum_u32, sum_u32);

  return simd64<uint64_t>(sum_u64);
}
/* end file src/simdutf/lsx/simd64-inl.h */

} // namespace simd
} // unnamed namespace
} // namespace lsx
} // namespace simdutf

#endif // SIMDUTF_LSX_SIMD_H
/* end file src/simdutf/lsx/simd.h */

/* begin file src/simdutf/lsx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP
/* end file src/simdutf/lsx/end.h */

#endif // SIMDUTF_IMPLEMENTATION_LSX

#endif // SIMDUTF_LSX_H
/* end file src/simdutf/lsx.h */
/* begin file src/simdutf/fallback.h */
#ifndef SIMDUTF_FALLBACK_H
#define SIMDUTF_FALLBACK_H


// Note that fallback.h is always imported last.

// Default Fallback to on unless a builtin implementation has already been
// selected.
#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
  #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ||        \
      SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE ||     \
      SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV ||            \
      SIMDUTF_CAN_ALWAYS_RUN_LSX || SIMDUTF_CAN_ALWAYS_RUN_LASX
    #define SIMDUTF_IMPLEMENTATION_FALLBACK 0
  #else
    #define SIMDUTF_IMPLEMENTATION_FALLBACK 1
  #endif
#endif

#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)

#if SIMDUTF_IMPLEMENTATION_FALLBACK

namespace simdutf {
/**
 * Fallback implementation (runs on any machine).
 */
namespace fallback {} // namespace fallback
} // namespace simdutf

/* begin file src/simdutf/fallback/implementation.h */
#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
#define SIMDUTF_FALLBACK_IMPLEMENTATION_H


namespace simdutf {
namespace fallback {

namespace {
using namespace simdutf;
}

class implementation final : public simdutf::implementation {
public:
  simdutf_really_inline implementation()
      : simdutf::implementation("fallback", "Generic fallback implementation",
                                0) {}

  simdutf_warn_unused bool validate_utf8(const char *buf,
                                         size_t len) const noexcept final;

  simdutf_warn_unused result
  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_ascii(const char *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result
  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;

  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                          size_t len) const noexcept final;
  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *buf, size_t len, char *utf8_output) const noexcept final;

  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len, char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;

  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;

  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final;
  simdutf_warn_unused result
  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                      char *latin1_output) const noexcept final;
  simdutf_warn_unused size_t
  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                char *latin1_output) const noexcept final;

  simdutf_warn_unused size_t count_utf8(const char *buf,
                                        size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf32_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t latin1_length_from_utf8(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused size_t utf8_length_from_latin1(
      const char *input, size_t length) const noexcept override;

  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_options =
          last_chunk_handling_options::loose) const noexcept override;
  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override;
  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override;
  const char *find(const char *start, const char *end,
                   char character) const noexcept override;
  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override;

};
} // namespace fallback
} // namespace simdutf

#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
/* end file src/simdutf/fallback/implementation.h */

/* begin file src/simdutf/fallback/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "fallback"
// #define SIMDUTF_IMPLEMENTATION fallback
/* end file src/simdutf/fallback/begin.h */

  // Declarations
/* begin file src/simdutf/fallback/bitmanipulation.h */
#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
#define SIMDUTF_FALLBACK_BITMANIPULATION_H

#include <limits>

namespace simdutf {
namespace fallback {
namespace {} // unnamed namespace
} // namespace fallback
} // namespace simdutf

#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
/* end file src/simdutf/fallback/bitmanipulation.h */

/* begin file src/simdutf/fallback/end.h */
/* end file src/simdutf/fallback/end.h */

#endif // SIMDUTF_IMPLEMENTATION_FALLBACK
#endif // SIMDUTF_FALLBACK_H
/* end file src/simdutf/fallback.h */
#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
SIMDUTF_POP_DISABLE_WARNINGS
#endif

// The scalar routines should be included once.


/* begin file src/implementation.cpp */
#include <algorithm>
#include <climits>
#include <type_traits>
#include <utility>
#if SIMDUTF_ATOMIC_REF
  #include <array>
#endif

// The macro SIMDUTF_USE_STATIC_INITIALIZATION, when set to 1, means that we
// will use translation-unit-scope variables to hold our implementations.
//
// The downside of a translation-unit-scope variable is that the initialization
// order is not well defined, thus if someone uses simdutf before main() starts,
// they might get a crash. Thus setting SIMDUTF_USE_STATIC_INITIALIZATION to 1
// is not recommended if you are using simdutf in a library that might be used
// by other code before main() starts. However, the upside is that there is no
// synchronization overhead on every call to get_active_implementation(). When
// compiling without the c++ standard library, we use static initialization,
// because C++ relies on the standard library for thread-safe initialization of
// function-scope static variables.
//
// By default, we avoid translation-unit-scope static initialization, so we set
// SIMDUTF_USE_STATIC_INITIALIZATION to 0. It comes with a small performance
// cost on the first call to get_active_implementation(), and a smaller cost on
// subsequent calls but it is then safe to use the simdutf library in static
// initialization.
//
// Further reading: https://en.cppreference.com/cpp/language/siof
#ifndef SIMDUTF_USE_STATIC_INITIALIZATION
  #if SIMDUTF_NO_LIBCXX
    #define SIMDUTF_USE_STATIC_INITIALIZATION 1
  #else // SIMDUTF_NO_LIBCXX
    #define SIMDUTF_USE_STATIC_INITIALIZATION 0
  #endif // SIMDUTF_NO_LIBCXX
#endif   // SIMDUTF_USE_STATIC_INITIALIZATION

// When building without libc++abi (SIMDUTF_NO_LIBCXX=1) on GCC/Clang, provide
// a weak stub for __cxa_pure_virtual so the abstract implementation vtable
// does not drag in libc++abi just for this unreachable hook. Kept weak so a
// real libc++abi definition wins if one is linked in anyway.
#if SIMDUTF_NO_LIBCXX
extern "C" __attribute__((weak, noreturn)) void __cxa_pure_virtual() {
  __builtin_trap();
}
namespace std {
__attribute__((weak, noreturn)) void
__glibcxx_assert_fail(const char *, int, const char *, const char *) noexcept {
  __builtin_trap();
}
} // namespace std
#endif

static_assert(sizeof(uint8_t) == sizeof(char),
              "simdutf requires that uint8_t be a char");
static_assert(sizeof(uint16_t) == sizeof(char16_t),
              "simdutf requires that char16_t be 16 bits");
static_assert(sizeof(uint32_t) == sizeof(char32_t),
              "simdutf requires that char32_t be 32 bits");
// next line is redundant, but it is kept to catch defective systems.
static_assert(CHAR_BIT == 8, "simdutf requires 8-bit bytes");

namespace simdutf {
bool implementation::supported_by_runtime_system() const {
  uint32_t required_instruction_sets = this->required_instruction_sets();
  uint32_t supported_instruction_sets =
      internal::detect_supported_architectures();
  return ((supported_instruction_sets & required_instruction_sets) ==
          required_instruction_sets);
}

simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return scalar::base64::maximal_binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return scalar::base64::maximal_binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return scalar::base64::binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return scalar::base64::binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::base64_length_from_binary(
    size_t length, base64_options options) const noexcept {
  return scalar::base64::base64_length_from_binary(length, options);
}

namespace internal {
// When there is a single implementation, we should not pay a price
// for dispatching to the best implementation. We should just use the
// one we have. This is a compile-time check.
#define SIMDUTF_SINGLE_IMPLEMENTATION                                          \
  (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL +           \
       SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 +        \
       SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_LSX +             \
       SIMDUTF_IMPLEMENTATION_LASX + SIMDUTF_IMPLEMENTATION_FALLBACK ==        \
   1)

#if SIMDUTF_IMPLEMENTATION_ICELAKE
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const icelake::implementation icelake_singleton{};
  #endif
static const icelake::implementation *get_icelake_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const icelake::implementation icelake_singleton{};
  #endif
  return &icelake_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const haswell::implementation haswell_singleton{};
  #endif
static const haswell::implementation *get_haswell_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const haswell::implementation haswell_singleton{};
  #endif
  return &haswell_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const westmere::implementation westmere_singleton{};
  #endif
static const westmere::implementation *get_westmere_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const westmere::implementation westmere_singleton{};
  #endif
  return &westmere_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_ARM64
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const arm64::implementation arm64_singleton{};
  #endif
static const arm64::implementation *get_arm64_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const arm64::implementation arm64_singleton{};
  #endif
  return &arm64_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_PPC64
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const ppc64::implementation ppc64_singleton{};
  #endif
static const ppc64::implementation *get_ppc64_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const ppc64::implementation ppc64_singleton{};
  #endif
  return &ppc64_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_RVV
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const rvv::implementation rvv_singleton{};
  #endif
static const rvv::implementation *get_rvv_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const rvv::implementation rvv_singleton{};
  #endif
  return &rvv_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_LASX
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const lasx::implementation lasx_singleton{};
  #endif
static const lasx::implementation *get_lasx_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const lasx::implementation lasx_singleton{};
  #endif
  return &lasx_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_LSX
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const lsx::implementation lsx_singleton{};
  #endif
static const lsx::implementation *get_lsx_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const lsx::implementation lsx_singleton{};
  #endif
  return &lsx_singleton;
}
#endif
#if SIMDUTF_IMPLEMENTATION_FALLBACK
  #if SIMDUTF_USE_STATIC_INITIALIZATION
static const fallback::implementation fallback_singleton{};
  #endif
static const fallback::implementation *get_fallback_singleton() {
  #if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const fallback::implementation fallback_singleton{};
  #endif
  return &fallback_singleton;
}
#endif

#if SIMDUTF_SINGLE_IMPLEMENTATION
simdutf_really_inline static const implementation *get_single_implementation() {
  return
  #if SIMDUTF_IMPLEMENTATION_ICELAKE
      get_icelake_singleton();
  #endif
  #if SIMDUTF_IMPLEMENTATION_HASWELL
  get_haswell_singleton();
  #endif
  #if SIMDUTF_IMPLEMENTATION_WESTMERE
  get_westmere_singleton();
  #endif
  #if SIMDUTF_IMPLEMENTATION_ARM64
  get_arm64_singleton();
  #endif
  #if SIMDUTF_IMPLEMENTATION_PPC64
  get_ppc64_singleton();
  #endif
  #if SIMDUTF_IMPLEMENTATION_LASX
  get_lasx_singleton();
  #endif
  #if SIMDUTF_IMPLEMENTATION_LSX
  get_lsx_singleton();
  #endif
  #if SIMDUTF_IMPLEMENTATION_FALLBACK
  get_fallback_singleton();
  #endif
}
#endif

/**
 * @private Detects best supported implementation on first use, and sets it
 */
class detect_best_supported_implementation_on_first_use final
    : public implementation {
public:
  std::string_view name() const noexcept final { return set_best()->name(); }
  std::string_view description() const noexcept final {
    return set_best()->description();
  }
  uint32_t required_instruction_sets() const noexcept final {
    return set_best()->required_instruction_sets();
  }

  simdutf_warn_unused bool
  validate_utf8(const char *buf, size_t len) const noexcept final override {
    return set_best()->validate_utf8(buf, len);
  }

  simdutf_warn_unused result validate_utf8_with_errors(
      const char *buf, size_t len) const noexcept final override {
    return set_best()->validate_utf8_with_errors(buf, len);
  }

  simdutf_warn_unused bool
  validate_ascii(const char *buf, size_t len) const noexcept final override {
    return set_best()->validate_ascii(buf, len);
  }
  simdutf_warn_unused result validate_ascii_with_errors(
      const char *buf, size_t len) const noexcept final override {
    return set_best()->validate_ascii_with_errors(buf, len);
  }

  simdutf_warn_unused bool
  validate_utf32(const char32_t *buf,
                 size_t len) const noexcept final override {
    return set_best()->validate_utf32(buf, len);
  }

  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *buf, size_t len) const noexcept final override {
    return set_best()->validate_utf32_with_errors(buf, len);
  }

  simdutf_warn_unused size_t
  convert_latin1_to_utf8(const char *buf, size_t len,
                         char *utf8_output) const noexcept final override {
    return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
  }

  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *buf, size_t len,
      char32_t *latin1_output) const noexcept final override {
    return set_best()->convert_latin1_to_utf32(buf, len, latin1_output);
  }

  simdutf_warn_unused size_t
  convert_utf8_to_latin1(const char *buf, size_t len,
                         char *latin1_output) const noexcept final override {
    return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
  }

  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *buf, size_t len,
      char *latin1_output) const noexcept final override {
    return set_best()->convert_utf8_to_latin1_with_errors(buf, len,
                                                          latin1_output);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *buf, size_t len,
      char *latin1_output) const noexcept final override {
    return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
  }

  simdutf_warn_unused size_t
  convert_utf8_to_utf32(const char *buf, size_t len,
                        char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
  }

  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_utf8_to_utf32_with_errors(buf, len,
                                                         utf32_output);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *buf, size_t len,
      char32_t *utf32_output) const noexcept final override {
    return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
  }

  simdutf_warn_unused size_t
  convert_utf32_to_latin1(const char32_t *buf, size_t len,
                          char *latin1_output) const noexcept final override {
    return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
  }

  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
      const char32_t *buf, size_t len,
      char *latin1_output) const noexcept final override {
    return set_best()->convert_utf32_to_latin1_with_errors(buf, len,
                                                           latin1_output);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
      const char32_t *buf, size_t len,
      char *latin1_output) const noexcept final override {
    return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
  }

  simdutf_warn_unused size_t
  convert_utf32_to_utf8(const char32_t *buf, size_t len,
                        char *utf8_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
  }

  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *buf, size_t len,
      char *utf8_output) const noexcept final override {
    return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  }

  simdutf_warn_unused size_t
  convert_valid_utf32_to_utf8(const char32_t *buf, size_t len,
                              char *utf8_output) const noexcept final override {
    return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
  }

  simdutf_warn_unused size_t
  count_utf8(const char *buf, size_t len) const noexcept final override {
    return set_best()->count_utf8(buf, len);
  }

  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *buf, size_t len) const noexcept override {
    return set_best()->latin1_length_from_utf8(buf, len);
  }

  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *buf, size_t len) const noexcept override {
    return set_best()->utf8_length_from_latin1(buf, len);
  }

  simdutf_warn_unused size_t utf8_length_from_utf32(
      const char32_t *buf, size_t len) const noexcept override {
    return set_best()->utf8_length_from_utf32(buf, len);
  }

  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *buf, size_t len) const noexcept override {
    return set_best()->utf32_length_from_utf8(buf, len);
  }

  simdutf_warn_unused result base64_to_binary(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary(input, length, output, options,
                                        last_chunk_handling_options);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char *input, size_t length, char *output, base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary_details(input, length, output, options,
                                                last_chunk_handling_options);
  }

  simdutf_warn_unused result base64_to_binary(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary(input, length, output, options,
                                        last_chunk_handling_options);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *input, size_t length, char *output,
      base64_options options,
      last_chunk_handling_options last_chunk_handling_options =
          last_chunk_handling_options::loose) const noexcept override {
    return set_best()->base64_to_binary_details(input, length, output, options,
                                                last_chunk_handling_options);
  }

  size_t binary_to_base64(const char *input, size_t length, char *output,
                          base64_options options) const noexcept override {
    return set_best()->binary_to_base64(input, length, output, options);
  }

  size_t
  binary_to_base64_with_lines(const char *input, size_t length, char *output,
                              size_t line_length,
                              base64_options options) const noexcept override {
    return set_best()->binary_to_base64_with_lines(input, length, output,
                                                   line_length, options);
  }

  const char *find(const char *start, const char *end,
                   char character) const noexcept override {
    return set_best()->find(start, end, character);
  }

  const char16_t *find(const char16_t *start, const char16_t *end,
                       char16_t character) const noexcept override {
    return set_best()->find(start, end, character);
  }

  simdutf_warn_unused size_t binary_length_from_base64(
      const char *input, size_t length) const noexcept override {
    return set_best()->binary_length_from_base64(input, length);
  }

  simdutf_warn_unused size_t binary_length_from_base64(
      const char16_t *input, size_t length) const noexcept override {
    return set_best()->binary_length_from_base64(input, length);
  }

  simdutf_really_inline
  detect_best_supported_implementation_on_first_use() noexcept
      : implementation("best_supported_detector",
                       "Detects the best supported implementation and sets it",
                       0) {}

private:
  const implementation *set_best() const noexcept;
};

static_assert(std::is_trivially_destructible<
                  detect_best_supported_implementation_on_first_use>::value,
              "detect_best_supported_implementation_on_first_use should be "
              "trivially destructible");

#if SIMDUTF_USE_STATIC_INITIALIZATION
static const std::initializer_list<const implementation *>
    available_implementation_pointers{
  #if SIMDUTF_IMPLEMENTATION_ICELAKE
        get_icelake_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_HASWELL
        get_haswell_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_WESTMERE
        get_westmere_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_ARM64
        get_arm64_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_PPC64
        get_ppc64_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_RVV
        get_rvv_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_LASX
        get_lasx_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_LSX
        get_lsx_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_FALLBACK
        get_fallback_singleton(),
  #endif
    };
#endif
static const std::initializer_list<const implementation *> &
get_available_implementation_pointers() {
#if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const std::initializer_list<const implementation *>
      available_implementation_pointers{
  #if SIMDUTF_IMPLEMENTATION_ICELAKE
          get_icelake_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_HASWELL
          get_haswell_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_WESTMERE
          get_westmere_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_ARM64
          get_arm64_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_PPC64
          get_ppc64_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_RVV
          get_rvv_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_LASX
          get_lasx_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_LSX
          get_lsx_singleton(),
  #endif
  #if SIMDUTF_IMPLEMENTATION_FALLBACK
          get_fallback_singleton(),
  #endif
      };
#endif
  return available_implementation_pointers;
}

// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no
// support
class unsupported_implementation final : public implementation {
public:

  simdutf_warn_unused bool validate_utf8(const char *,
                                         size_t) const noexcept final override {
    return false; // Just refuse to validate. Given that we have a fallback
                  // implementation
    // it seems unlikely that unsupported_implementation will ever be used. If
    // it is used, then it will flag all strings as invalid. The alternative is
    // to return an error_code from which the user has to figure out whether the
    // string is valid UTF-8... which seems like a lot of work just to handle
    // the very unlikely case that we have an unsupported implementation. And,
    // when it does happen (that we have an unsupported implementation), what
    // are the chances that the programmer has a fallback? Given that *we*
    // provide the fallback, it implies that the programmer would need a
    // fallback for our fallback.
  }

  simdutf_warn_unused result validate_utf8_with_errors(
      const char *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused bool
  validate_ascii(const char *, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused result validate_ascii_with_errors(
      const char *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused bool
  validate_utf32(const char32_t *, size_t) const noexcept final override {
    return false;
  }

  simdutf_warn_unused result validate_utf32_with_errors(
      const char32_t *, size_t) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_latin1_to_utf8(
      const char *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_latin1_to_utf32(
      const char *, size_t, char32_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf8_to_latin1(
      const char *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
      const char *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
      const char *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf8_to_utf32(
      const char *, size_t, char32_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
      const char *, size_t, char32_t *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
      const char *, size_t, char32_t *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf32_to_latin1(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
      const char32_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t convert_utf32_to_utf8(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
      const char32_t *, size_t, char *) const noexcept final override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
      const char32_t *, size_t, char *) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t count_utf8(const char *,
                                        size_t) const noexcept final override {
    return 0;
  }

  simdutf_warn_unused size_t
  latin1_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t
  utf8_length_from_latin1(const char *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t
  utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused size_t
  utf32_length_from_utf8(const char *, size_t) const noexcept override {
    return 0;
  }

  simdutf_warn_unused result
  base64_to_binary(const char *, size_t, char *, base64_options,
                   last_chunk_handling_options) const noexcept override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char *, size_t, char *, base64_options,
      last_chunk_handling_options) const noexcept override {
    return full_result(error_code::OTHER, 0, 0);
  }

  simdutf_warn_unused result
  base64_to_binary(const char16_t *, size_t, char *, base64_options,
                   last_chunk_handling_options) const noexcept override {
    return result(error_code::OTHER, 0);
  }

  simdutf_warn_unused full_result base64_to_binary_details(
      const char16_t *, size_t, char *, base64_options,
      last_chunk_handling_options) const noexcept override {
    return full_result(error_code::OTHER, 0, 0);
  }

  size_t binary_to_base64(const char *, size_t, char *,
                          base64_options) const noexcept override {
    return 0;
  }
  size_t binary_to_base64_with_lines(const char *, size_t, char *, size_t,
                                     base64_options) const noexcept override {
    return 0;
  }
  const char *find(const char *, const char *, char) const noexcept override {
    return nullptr;
  }
  const char16_t *find(const char16_t *, const char16_t *,
                       char16_t) const noexcept override {
    return nullptr;
  }
  simdutf_warn_unused size_t
  binary_length_from_base64(const char *, size_t) const noexcept override {
    return 0;
  }
  simdutf_warn_unused size_t
  binary_length_from_base64(const char16_t *, size_t) const noexcept override {
    return 0;
  }

  unsupported_implementation()
      : implementation("unsupported",
                       "Unsupported CPU (no detected SIMD instructions)", 0) {}
};

#if SIMDUTF_USE_STATIC_INITIALIZATION
static const unsupported_implementation unsupported_singleton{};
#endif
const unsupported_implementation *get_unsupported_singleton() {
#if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const unsupported_implementation unsupported_singleton{};
#endif
  return &unsupported_singleton;
}
static_assert(std::is_trivially_destructible<unsupported_implementation>::value,
              "unsupported_singleton should be trivially destructible");

size_t available_implementation_list::size() const noexcept {
  return internal::get_available_implementation_pointers().size();
}
const implementation *const *
available_implementation_list::begin() const noexcept {
  return internal::get_available_implementation_pointers().begin();
}
const implementation *const *
available_implementation_list::end() const noexcept {
  return internal::get_available_implementation_pointers().end();
}
const implementation *
available_implementation_list::detect_best_supported() const noexcept {
  // They are prelisted in priority order, so we just go down the list
  uint32_t supported_instruction_sets =
      internal::detect_supported_architectures();
  for (const implementation *impl :
       internal::get_available_implementation_pointers()) {
    uint32_t required_instruction_sets = impl->required_instruction_sets();
    if ((supported_instruction_sets & required_instruction_sets) ==
        required_instruction_sets) {
      return impl;
    }
  }
  return get_unsupported_singleton(); // this should never happen?
}

const implementation *
detect_best_supported_implementation_on_first_use::set_best() const noexcept {
  SIMDUTF_PUSH_DISABLE_WARNINGS
  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC:
                                     // manually verified this is safe
      char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
  SIMDUTF_POP_DISABLE_WARNINGS

  if (force_implementation_name) {
    auto force_implementation =
        get_available_implementations()[force_implementation_name];
    if (force_implementation) {
      return get_active_implementation() = force_implementation;
    } else {
      // Note: abort() and stderr usage within the library is forbidden.
      return get_active_implementation() = get_unsupported_singleton();
    }
  }
  return get_active_implementation() =
             get_available_implementations().detect_best_supported();
}

} // namespace internal

/**
 * The list of available implementations compiled into simdutf.
 */
#if SIMDUTF_USE_STATIC_INITIALIZATION
static const internal::available_implementation_list
    available_implementations_instance{};
#endif
SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
get_available_implementations() {
#if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const internal::available_implementation_list
      available_implementations_instance{};
#endif
  return available_implementations_instance;
}

#if SIMDUTF_USE_STATIC_INITIALIZATION && !SIMDUTF_SINGLE_IMPLEMENTATION
static const internal::detect_best_supported_implementation_on_first_use
    detect_best_supported_implementation_on_first_use_singleton;
#endif

#if SIMDUTF_USE_STATIC_INITIALIZATION
static internal::atomic_ptr<const implementation>
    active_implementation_instance{
  #if SIMDUTF_SINGLE_IMPLEMENTATION
        internal::get_single_implementation()
  #else
        &detect_best_supported_implementation_on_first_use_singleton
  #endif
    };
#endif

/**
 * The active implementation.
 */
SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
get_active_implementation() {
#if !SIMDUTF_USE_STATIC_INITIALIZATION
  #if !SIMDUTF_SINGLE_IMPLEMENTATION
  static const internal::detect_best_supported_implementation_on_first_use
      detect_best_supported_implementation_on_first_use_singleton;
  #endif
  static internal::atomic_ptr<const implementation>
      active_implementation_instance{
  #if SIMDUTF_SINGLE_IMPLEMENTATION
          internal::get_single_implementation()
  #else
          &detect_best_supported_implementation_on_first_use_singleton
  #endif
      };
#endif
  return active_implementation_instance;
}

#if SIMDUTF_SINGLE_IMPLEMENTATION
simdutf_really_inline const implementation *get_default_implementation() {
  return internal::get_single_implementation();
}
#else
simdutf_really_inline internal::atomic_ptr<const implementation> &
get_default_implementation() {
  return get_active_implementation();
}
#endif
#define SIMDUTF_GET_CURRENT_IMPLEMENTATION

simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
  return get_default_implementation()->validate_utf8(buf, len);
}
simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
                                                     size_t len) noexcept {
  return get_default_implementation()->validate_utf8_with_errors(buf, len);
}

simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
  return get_default_implementation()->validate_ascii(buf, len);
}
simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
                                                      size_t len) noexcept {
  return get_default_implementation()->validate_ascii_with_errors(buf, len);
}

simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len,
                                                  char *utf8_output) noexcept {
  return get_default_implementation()->convert_latin1_to_utf8(buf, len,
                                                              utf8_output);
}

simdutf_warn_unused size_t convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *latin1_output) noexcept {
  return get_default_implementation()->convert_latin1_to_utf32(buf, len,
                                                               latin1_output);
}
// moved to the header file
// simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) noexcept
// simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) noexcept

simdutf_warn_unused size_t convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) noexcept {
  return get_default_implementation()->convert_utf8_to_latin1(buf, len,
                                                              latin1_output);
}
simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) noexcept {
  return get_default_implementation()->convert_utf8_to_latin1_with_errors(
      buf, len, latin1_output);
}
simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) noexcept {
  return get_default_implementation()->convert_valid_utf8_to_latin1(
      buf, len, latin1_output);
}

simdutf_warn_unused size_t convert_utf8_to_utf32(
    const char *input, size_t length, char32_t *utf32_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf32(input, length,
                                                             utf32_output);
}
simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
    const char *input, size_t length, char32_t *utf32_output) noexcept {
  return get_default_implementation()->convert_utf8_to_utf32_with_errors(
      input, length, utf32_output);
}

  #if SIMDUTF_ATOMIC_REF
template <typename char_type>
simdutf_warn_unused result atomic_base64_to_binary_safe_impl(
    const char_type *input, size_t length, char *output, size_t &outlen,
    base64_options options,
    last_chunk_handling_options last_chunk_handling_options,
    bool decode_up_to_bad_char) noexcept {
    #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
  // We use a smaller buffer during fuzzing to more easily detect bugs.
  constexpr size_t buffer_size = 128;
    #else
  // Arbitrary block sizes: 4KB for input.
  constexpr size_t buffer_size = 4096;
    #endif
  std::array<char, buffer_size> temp_buffer;
  const char_type *const input_init = input;
  size_t actual_out = 0;
  bool last_chunk = false;
  const size_t length_init = length;
  result r;
  while (!last_chunk) {
    last_chunk |= (temp_buffer.size() >= outlen - actual_out);
    size_t temp_outlen = (std::min)(temp_buffer.size(), outlen - actual_out);
    r = base64_to_binary_safe(input, length, temp_buffer.data(), temp_outlen,
                              options, last_chunk_handling_options,
                              decode_up_to_bad_char);
    // We processed r.count characters of input.
    // We wrote temp_outlen bytes to temp_buffer.
    // If there is no ignorable characters,
    // we should expect that values/4.0*3 == temp_outlen,
    // except maybe at the tail end of the string.

    //
    // We are assuming that when r.error == error_code::OUTPUT_BUFFER_TOO_SMALL,
    // we truncate the results so that a number of base64 characters divisible
    // by four is processed.
    //

    //
    // We wrote temp_outlen bytes to temp_buffer.
    // We need to copy them to output.
    // Copy with relaxed atomic operations to the output
    simdutf_log_assert(temp_outlen <= outlen - actual_out,
                       "Output buffer is too small");
    simdutf_log_assert(temp_outlen <= temp_buffer.size(),
                       "Output buffer is too small");

    simdutf::scalar::memcpy_atomic_write(output + actual_out,
                                         temp_buffer.data(), temp_outlen);
    actual_out += temp_outlen;
    length -= r.count;
    input += r.count;

    if (r.error != error_code::OUTPUT_BUFFER_TOO_SMALL) {
      break;
    }
  }
  if (size_t(input - input_init) != length_init) {
    // We did not process all input characters. In such case, we
    // should not end with an ignorable character. See
    // https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
    while (input > input_init && base64_ignorable(*(input - 1), options)) {
      --input;
    }
  }
  outlen = actual_out;
  return {r.error, size_t(input - input_init)};
}

simdutf_warn_unused result atomic_base64_to_binary_safe(
    const char *input, size_t length, char *output, size_t &outlen,
    base64_options options,
    last_chunk_handling_options last_chunk_handling_options,
    bool decode_up_to_bad_char) noexcept {
  return atomic_base64_to_binary_safe_impl<char>(
      input, length, output, outlen, options, last_chunk_handling_options,
      decode_up_to_bad_char);
}
simdutf_warn_unused result atomic_base64_to_binary_safe(
    const char16_t *input, size_t length, char *output, size_t &outlen,
    base64_options options,
    last_chunk_handling_options last_chunk_handling_options,
    bool decode_up_to_bad_char) noexcept {
  return atomic_base64_to_binary_safe_impl<char16_t>(
      input, length, output, outlen, options, last_chunk_handling_options,
      decode_up_to_bad_char);
}
  #endif // SIMDUTF_ATOMIC_REF

simdutf_warn_unused bool validate_utf32(const char32_t *buf,
                                        size_t len) noexcept {
  return get_default_implementation()->validate_utf32(buf, len);
}
simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
                                                      size_t len) noexcept {
  return get_default_implementation()->validate_utf32_with_errors(buf, len);
}

simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
    const char *input, size_t length, char32_t *utf32_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf8_to_utf32(
      input, length, utf32_buffer);
}

simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf,
                                                 size_t len,
                                                 char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf8(buf, len,
                                                             utf8_buffer);
}
simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_utf8_with_errors(
      buf, len, utf8_buffer);
}
simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len,
                                                                   utf8_buffer);
}

simdutf_warn_unused size_t convert_utf32_to_latin1(
    const char32_t *input, size_t length, char *latin1_output) noexcept {
  return get_default_implementation()->convert_utf32_to_latin1(input, length,
                                                               latin1_output);
}
simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
    const char32_t *input, size_t length, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_utf32_to_latin1_with_errors(
      input, length, latin1_buffer);
}
simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
    const char32_t *input, size_t length, char *latin1_buffer) noexcept {
  return get_default_implementation()->convert_valid_utf32_to_latin1(
      input, length, latin1_buffer);
}

simdutf_warn_unused size_t count_utf8(const char *input,
                                      size_t length) noexcept {
  return get_default_implementation()->count_utf8(input, length);
}

simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf,
                                                   size_t len) noexcept {
  return get_default_implementation()->latin1_length_from_utf8(buf, len);
}

simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf,
                                                   size_t len) noexcept {
  return get_default_implementation()->utf8_length_from_latin1(buf, len);
}

simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
                                                  size_t length) noexcept {
  return get_default_implementation()->utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
                                                  size_t length) noexcept {
  return get_default_implementation()->utf32_length_from_utf8(input, length);
}

// this has been moved to implementation.h
// simdutf_warn_unused size_t
// base64_length_from_binary(size_t length, base64_options option) noexcept;

// this has been moved to implementation.h
// simdutf_warn_unused size_t base64_length_from_binary_with_lines(
//     size_t length, base64_options options, size_t line_length) noexcept;
// }

simdutf_warn_unused const char *detail::find(const char *start, const char *end,
                                             char character) noexcept {
  return get_default_implementation()->find(start, end, character);
}
simdutf_warn_unused const char16_t *detail::find(const char16_t *start,
                                                 const char16_t *end,
                                                 char16_t character) noexcept {
  return get_default_implementation()->find(start, end, character);
}

simdutf_warn_unused size_t
maximal_binary_length_from_base64(const char *input, size_t length) noexcept {
  return get_default_implementation()->maximal_binary_length_from_base64(
      input, length);
}

simdutf_warn_unused result base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  return get_default_implementation()->base64_to_binary(
      input, length, output, options, last_chunk_handling_options);
}

simdutf_warn_unused size_t maximal_binary_length_from_base64(
    const char16_t *input, size_t length) noexcept {
  return get_default_implementation()->maximal_binary_length_from_base64(
      input, length);
}

simdutf_warn_unused size_t binary_length_from_base64(const char *input,
                                                     size_t length) noexcept {
  return get_default_implementation()->binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t binary_length_from_base64(const char16_t *input,
                                                     size_t length) noexcept {
  return get_default_implementation()->binary_length_from_base64(input, length);
}

simdutf_warn_unused result base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  return get_default_implementation()->base64_to_binary(
      input, length, output, options, last_chunk_handling_options);
}

simdutf_warn_unused full_result base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  return get_default_implementation()->base64_to_binary_details(
      input, length, output, options, last_chunk_handling_options);
}

simdutf_warn_unused full_result base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_handling_options) noexcept {
  return get_default_implementation()->base64_to_binary_details(
      input, length, output, options, last_chunk_handling_options);
}

// moved to implementation.h
// simdutf_warn_unused bool base64_ignorable(char input,
//                                           base64_options options) noexcept
// simdutf_warn_unused bool base64_ignorable(char16_t input,
//                                           base64_options options) noexcept
// simdutf_warn_unused bool base64_valid(char input,
//                                       base64_options options) noexcept
// simdutf_warn_unused bool base64_valid(char16_t input,
//                                       base64_options options) noexcept
// simdutf_warn_unused bool
// base64_valid_or_padding(char input, base64_options options) noexcept
// simdutf_warn_unused bool
// base64_valid_or_padding(char16_t input, base64_options options) noexcept

// base64_to_binary_safe_impl is moved to
// include/simdutf/base64_implementation.h

  #if SIMDUTF_ATOMIC_REF
size_t atomic_binary_to_base64(const char *input, size_t length, char *output,
                               base64_options options) noexcept {
  size_t retval = 0;
    #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
  // We use a smaller buffer during fuzzing to more easily detect bugs.
  constexpr size_t input_block_size = 128 * 3;
    #else
  // Arbitrary block sizes: 3KB for input which produces 4KB in output.
  constexpr size_t input_block_size = 1024 * 3;
    #endif
  std::array<char, input_block_size> inbuf;
  for (size_t i = 0; i < length; i += input_block_size) {
    const size_t current_block_size = std::min(input_block_size, length - i);
    simdutf::scalar::memcpy_atomic_read(inbuf.data(), input + i,
                                        current_block_size);
    const size_t written = binary_to_base64(inbuf.data(), current_block_size,
                                            output + retval, options);
    retval += written;
  }
  return retval;
}
  #endif // SIMDUTF_ATOMIC_REF

simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
    const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept {
  const auto start{utf8_output};

  while (true) {
    // convert_latin1_to_utf8 will never write more than input length * 2
    auto read_len = std::min(len, utf8_len >> 1);
    if (read_len <= 16) {
      break;
    }

    const auto write_len =
        simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output);

    utf8_output += write_len;
    utf8_len -= write_len;
    buf += read_len;
    len -= read_len;
  }

  utf8_output +=
      scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len);

  return utf8_output - start;
}

simdutf_warn_unused result
base64_to_binary_safe(const char *input, size_t length, char *output,
                      size_t &outlen, base64_options options,
                      last_chunk_handling_options last_chunk_handling_options,
                      bool decode_up_to_bad_char) noexcept {
  return base64_to_binary_safe_impl<char>(input, length, output, outlen,
                                          options, last_chunk_handling_options,
                                          decode_up_to_bad_char);
}
simdutf_warn_unused result
base64_to_binary_safe(const char16_t *input, size_t length, char *output,
                      size_t &outlen, base64_options options,
                      last_chunk_handling_options last_chunk_handling_options,
                      bool decode_up_to_bad_char) noexcept {
  return base64_to_binary_safe_impl<char16_t>(
      input, length, output, outlen, options, last_chunk_handling_options,
      decode_up_to_bad_char);
}

size_t binary_to_base64(const char *input, size_t length, char *output,
                        base64_options options) noexcept {
  return get_default_implementation()->binary_to_base64(input, length, output,
                                                        options);
}

size_t binary_to_base64_with_lines(const char *input, size_t length,
                                   char *output, size_t line_length,
                                   base64_options options) noexcept {
  return get_default_implementation()->binary_to_base64_with_lines(
      input, length, output, line_length, options);
}

#if SIMDUTF_USE_STATIC_INITIALIZATION
static const implementation *const builtin_impl_instance =
    get_available_implementations()[SIMDUTF_STRINGIFY(
        SIMDUTF_BUILTIN_IMPLEMENTATION)];
#endif
const implementation *builtin_implementation() {
#if !SIMDUTF_USE_STATIC_INITIALIZATION
  static const implementation *const builtin_impl_instance =
      get_available_implementations()[SIMDUTF_STRINGIFY(
          SIMDUTF_BUILTIN_IMPLEMENTATION)];
#endif
  return builtin_impl_instance;
}

simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) {
  return scalar::utf8::trim_partial_utf8(input, length);
}

} // namespace simdutf
/* end file src/implementation.cpp */

SIMDUTF_PUSH_DISABLE_WARNINGS
SIMDUTF_DISABLE_UNDESIRED_WARNINGS

#if SIMDUTF_IMPLEMENTATION_ARM64
/* begin file src/arm64/implementation.cpp */
/* begin file src/simdutf/arm64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "arm64"
// #define SIMDUTF_IMPLEMENTATION arm64
/* end file src/simdutf/arm64/begin.h */
namespace simdutf {
namespace arm64 {
namespace {
#ifndef SIMDUTF_ARM64_H
  #error "arm64.h must be included"
#endif
using namespace simd;

simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  simd8<uint8_t> bits = input.reduce_or();
  return bits.max_val() < 0b10000000u;
}

simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
  return is_third_byte ^ is_fourth_byte;
}

// common functions for utf8 conversions
simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) {
  // Low half contains  10cccccc|1110aaaa
  // High half contains 10bbbbbb|10bbbbbb
  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1,
                                                4, 4, 7, 7, 10, 10);
  #else
  const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10};
  #endif
  uint8x16_t perm = vqtbl1q_u8(in, sh);
  // Split into half vectors.
  // 10cccccc|1110aaaa
  uint8x8_t perm_low = vget_low_u8(perm); // no-op
  // 10bbbbbb|10bbbbbb
  uint8x8_t perm_high = vget_high_u8(perm);
  // xxxxxxxx 10bbbbbb
  uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op
  // xxxxxxxx 1110aaaa
  uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op
  // Assemble with shift left insert.
  // xxxxxxaa aabbbbbb
  uint16x4_t mid_high = vsli_n_u16(mid, high, 6);
  // (perm_low << 8) | (perm_low >> 8)
  // xxxxxxxx 10cccccc
  uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low));
  // Shift left insert into the low bits
  // aaaabbbb bbcccccc
  uint16x4_t composed = vsli_n_u16(low, mid_high, 6);
  return composed;
}

simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) {
  // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters.
  // Technically this calculates 8, but 6 does better and happens more often
  // (The languages which use these codepoints use ASCII spaces so 8 would need
  // to be in the middle of a very long word).

  // 10bbbbbb 110aaaaa
  uint16x8_t upper = vreinterpretq_u16_u8(in);
  // (in << 8) | (in >> 8)
  // 110aaaaa 10bbbbbb
  uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in));
  // 00000000 000aaaaa
  uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F));
  // Assemble with shift left insert.
  // 00000aaa aabbbbbb
  uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6);
  return composed;
}

simdutf_really_inline uint16x8_t
convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) {
  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
  // This is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes.
  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
      simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]));
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
  // Mask
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 000aaaaa 00000000
  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
  return composed;
}

/* begin file src/arm64/arm_validate_utf32le.cpp */
const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) {
  const char32_t *end = input + size;

  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
  uint32x4_t currentmax = vmovq_n_u32(0x0);
  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);

  while (end - input >= 4) {
    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
    currentmax = vmaxq_u32(in, currentmax);
    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
    input += 4;
  }

  uint32x4_t is_zero =
      veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
  if (vmaxvq_u32(is_zero) != 0) {
    return nullptr;
  }

  is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
                      standardoffsetmax);
  if (vmaxvq_u32(is_zero) != 0) {
    return nullptr;
  }

  return input;
}

const result arm_validate_utf32le_with_errors(const char32_t *input,
                                              size_t size) {
  const char32_t *start = input;
  const char32_t *end = input + size;

  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
  uint32x4_t currentmax = vmovq_n_u32(0x0);
  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);

  while (end - input >= 4) {
    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
    currentmax = vmaxq_u32(in, currentmax);
    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);

    uint32x4_t is_zero =
        veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
    if (vmaxvq_u32(is_zero) != 0) {
      return result(error_code::TOO_LARGE, input - start);
    }

    is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
                        standardoffsetmax);
    if (vmaxvq_u32(is_zero) != 0) {
      return result(error_code::SURROGATE, input - start);
    }

    input += 4;
  }

  return result(error_code::SUCCESS, input - start);
}
/* end file src/arm64/arm_validate_utf32le.cpp */

/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
arm_convert_latin1_to_utf32(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const char *end = buf + len;

  while (end - buf >= 16) {
    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
    uint16x8_t in8low = vmovl_u8(vget_low_u8(in8));
    uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low));
    uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low));
    uint16x8_t in8high = vmovl_u8(vget_high_u8(in8));
    uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high));
    uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high));
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output), in16lowlow);
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 4), in16lowhigh);
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 8), in8highlow);
    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 12), in8highhigh);

    utf32_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf32_output);
}
/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */
/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */
/*
  Returns a pair: the first unprocessed byte from buf and utf8_output
  A scalar routing should carry on the conversion of the tail.
*/
std::pair<const char *, char *>
arm_convert_latin1_to_utf8(const char *latin1_input, size_t len,
                           char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char *end = latin1_input + len;
  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
  // We always write 16 bytes, of which more than the first 8 bytes
  // are valid. A safety margin of 8 is more than sufficient.
  while (end - latin1_input >= 16 + 8) {
    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(latin1_input));
    if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!!
      vst1q_u8(utf8_output, in8);
      utf8_output += 16;
      latin1_input += 16;
      continue;
    }

    // We just fallback on UTF-16 code. This could be optimized/simplified
    // further.
    uint16x8_t in16 = vmovl_u8(vget_low_u8(in8));
    // 1. prepare 2-byte values
    // input 8-bit word : [aabb|bbbb] x 8
    // expected output   : [1100|00aa|10bb|bbbb] x 8
    const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
    const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

    // t0 = [0000|00aa|bbbb|bb00]
    const uint16x8_t t0 = vshlq_n_u16(in16, 2);
    // t1 = [0000|00aa|0000|0000]
    const uint16x8_t t1 = vandq_u16(t0, v_1f00);
    // t2 = [0000|0000|00bb|bbbb]
    const uint16x8_t t2 = vandq_u16(in16, v_003f);
    // t3 = [0000|00aa|00bb|bbbb]
    const uint16x8_t t3 = vorrq_u16(t1, t2);
    // t4 = [1100|00aa|10bb|bbbb]
    const uint16x8_t t4 = vorrq_u16(t3, v_c080);
    // 2. merge ASCII and 2-byte codewords
    const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
    const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f);
    const uint8x16_t utf8_unpacked =
        vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4));
    // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint16x8_t mask = simdutf_make_uint16x8_t(
        0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
    const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                             0x0002, 0x0008, 0x0020, 0x0080};
#endif
    uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
    // 4. pack the bytes
    const uint8_t *row =
        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
    const uint8x16_t shuffle = vld1q_u8(row + 1);
    const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

    // 5. store bytes
    vst1q_u8(utf8_output, utf8_packed);
    // 6. adjust pointers
    latin1_input += 8;
    utf8_output += row[0];

  } // while

  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
}
/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */

/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */
// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 16, usually 12).
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, which actually consumes the full 16.
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process in chunks of 12 bytes
    vst1q_u8(reinterpret_cast<uint8_t *>(latin1_output), in);
    latin1_output += 12; // We wrote 12 18-bit characters.
    return 12;           // We consumed 12 bytes.
  }
  /// We do not have a fast path available, or the fast path is unimportant, so
  /// we fallback.
  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we should have (idx < 64), if not, there is a bug in the validation or
  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6
  // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy
  // scenario we process SIX (6) input code-code units. The max length in bytes
  // of six code code units spanning between 1 and 2 bytes each is 12 bytes.
  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
      simdutf::tables::utf8_to_utf16::shufutf8[idx]));
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
  // Mask
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 000aaaaa 00000000
  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
  // writing 8 bytes even though we only care about the first 6 bytes.
  uint8x8_t latin1_packed = vmovn_u16(composed);
  vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */
/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_out) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xFFF;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process in chunks of 12 bytes.
    // use fast implementation in src/simdutf/arm64/simd.h
    // Ideally the compiler can keep the tables in registers.
    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
    temp.store_ascii_as_utf32_tbl(utf32_out);
    utf32_output += 12; // We wrote 12 32-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. Convert to UTF-16
    uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in);
    // Zero extend and store via ST2 with a zero.
    uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}};
    vst2_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return 12;         // We consumed 12 bytes.
  }

  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
  if (input_utf8_end_of_code_point_mask == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
    // UTF-32 code units. Convert to UTF-16
    uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in);
    // Zero extend and store via ST2 with a zero.
    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
    utf32_output += 6; // We wrote 6 32-bit characters.
    return 12;         // We consumed 12 bytes.
  }
  /// Either no fast path or an unimportant fast path.

  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code-code units
    // Convert to UTF-16
    uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    // Zero extend and store with ST2 and zero
    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
    utf32_output += 6; // We wrote 6 32-bit characters.
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
    // Shuffle
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // Split
    // 00000000 00000000 0ccccccc
    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits
    // Note: unmasked
    // xxxxxxxx aaaaxxxx xxxxxxxx
    uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits
    // Use 16 bit bic instead of and.
    // The top bits will be corrected later in the bsl
    // 00000000 10bbbbbb 00000000
    uint32x4_t middle = vreinterpretq_u32_u16(
        vbicq_u16(vreinterpretq_u16_u32(perm),
                  vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits
    // Combine low and middle with shift right accumulate
    // 00000000 00xxbbbb bbcccccc
    uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2);
    // Insert top 4 bits from high byte with bitwise select
    // 00000000 aaaabbbb bbcccccc
    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid);
    vst1q_u32(utf32_output, composed);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code-code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
      // UTF-32 code units. This uses the same method as the fixed 3 byte
      // version, reversing and shift left insert. However, there is no need for
      // a shuffle mask now, just rev16 and rev32.
      //
      // This version does not use the LUT, but 4 byte sequences are less common
      // and the overhead of the extra memory access is less important than the
      // early branch overhead in shorter sequences, so it comes last.

      // Swap pairs of bytes
      // 10dddddd|10cccccc|10bbbbbb|11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in));
      // Shift left and insert
      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
      uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6);
      // Swap 16-bit lanes
      // xxxxcccc ccdddddd xxxxxxxa aabbbbbb
      // xxxxxxxa aabbbbbb xxxxcccc ccdddddd
      uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1));
      // Shift insert again
      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
      uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12);
      // Clear the garbage
      // 00000000 000aaabb bbbbcccc ccdddddd
      uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF));
      // Store
      vst1q_u32(utf32_output, composed);

      utf32_output += 3; // We wrote 3 32-bit characters.
      return 12;         // We consumed 12 bytes.
    }
    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
    // due to surrogates no longer being involved.
    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
    // Ascii
    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));
    uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00));
    // When converting the way we do, the 3 byte prefix will be interpreted as
    // the 18th bit being set, since the code would interpret the lead byte
    // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can
    // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since
    // NEON has shift right accumulate, we use that.
    //  4 byte   3 byte
    // 10bbbbbb 1110bbbb
    // 00000000 01000000 6th bit
    // 00000000 00100000 shift right
    // 10bbbbbb 0000bbbb add
    // 00bbbbbb 0000bbbb mask
    uint8x16_t correction =
        vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000)));
    uint32x4_t corrected = vreinterpretq_u32_u8(
        vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1));
    // 00000000 00000000 0000cccc ccdddddd
    uint32x4_t cd = vsraq_n_u32(ascii, middle, 2);
    // Insert twice
    // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx
    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6),
                              vshrq_n_u32(corrected, 4));
    // 00000000 000aaabb bbbbcccc ccdddddd
    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab);
    // Store
    vst1q_u32(utf32_output, composed);
    utf32_output += 3; // We wrote 3 32-bit characters.
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    return 12;
  }
}
/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */

/* begin file src/arm64/arm_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

/**
 * Insert a line feed character in the 16-byte input at index K in [0,16).
 */
inline uint8x16_t insert_line_feed16(uint8x16_t input, size_t K) {
  static const uint8_t shuffle_masks[16][16] = {
      {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}};
  // Prepare a vector with '\n' (0x0A)
  uint8x16_t line_feed_vector = vdupq_n_u8('\n');

  // Load the precomputed shuffle mask for K
  uint8x16_t mask = vld1q_u8(shuffle_masks[K]);

  // Create a mask where 0x80 indicates the line feed position
  uint8x16_t lf_pos = vceqq_u8(mask, vdupq_n_u8(0x80));

  uint8x16_t result = vqtbl1q_u8(input, mask);

  // Use vbsl to select '\n' where lf_pos is true, else keep input bytes
  return vbslq_u8(lf_pos, line_feed_vector, result);
}

// offset is the number of characters in the current line.
// It can range from 0 to line_length (inclusive).
// If offset == line_length, we need to insert a line feed before writing
// anything.
size_t write_output_with_line_feeds(uint8_t *dst, uint8x16_t src,
                                    size_t line_length, size_t &offset) {
  // Fast path: no need to insert line feeds
  // If we are at offset, we would write from [offset, offset + 16).
  // We need that line_length >= offset + 16.
  if (offset + 16 <= line_length) {
    // No need to insert line feeds
    vst1q_u8(dst, src);
    offset += 16; // offset could be line_length here.
    return 16;
  }

  // We have that offset + 16 >= line_length
  // the common case is that line_length is greater than 16
  if (simdutf_likely(line_length >= 16)) {
    // offset <= line_length.
    // offset + 16 > line_length
    // So line_length - offset < 16
    // and line_length - offset >= 0
    uint8x16_t chunk = insert_line_feed16(src, line_length - offset);
    vst1q_u8(dst, chunk);
    // Not ideal to pull the last element and write it separately but
    // it simplifies the code.
    *(dst + 16) = vgetq_lane_u8(src, 15);
    offset += 16 - line_length;
    return 16 + 1; // we wrote 16 bytes plus one line feed
  }
  // Uncommon case where line_length < 16
  // This is going to be SLOW.
  else {
    uint8_t buffer[16];
    vst1q_u8(buffer, src);
    size_t out_pos = 0;
    size_t local_offset = offset;
    for (size_t i = 0; i < 16;) {
      if (local_offset == line_length) {
        dst[out_pos++] = '\n';
        local_offset = 0;
      }
      dst[out_pos++] = buffer[i++];
      local_offset++;
    }
    offset = local_offset;
    return out_pos;
  }
}

template <bool insert_line_feeds>
size_t encode_base64_impl(char *dst, const char *src, size_t srclen,
                          base64_options options,
                          size_t line_length = simdutf::default_line_length) {
  size_t offset = 0;
  if (line_length < 4) {
    line_length = 4; // We do not support line_length less than 4
  }
  // credit: Wojciech Muła
  uint8_t *out = (uint8_t *)dst;
  constexpr static uint8_t source_table[64] = {
      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
      'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
  };
  constexpr static uint8_t source_table_url[64] = {
      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
  };
  const uint8x16_t v3f = vdupq_n_u8(0x3f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  // When trying to load a uint8_t array, Visual Studio might
  // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)':
  // cannot convert argument 1 from 'const uint8_t [64]' to 'const char *
  const uint8x16x4_t table = vld4q_u8(
      (reinterpret_cast<const char *>(options & base64_url) ? source_table_url
                                                            : source_table));
#else
  const uint8x16x4_t table =
      vld4q_u8((options & base64_url) ? source_table_url : source_table);
#endif
  size_t i = 0;
  for (; i + 16 * 3 <= srclen; i += 16 * 3) {
    const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
    uint8x16x4_t result;
    result.val[0] = vshrq_n_u8(in.val[0], 2);
    result.val[1] =
        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f);
    result.val[2] =
        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f);
    result.val[3] = vandq_u8(in.val[2], v3f);
    result.val[0] = vqtbl4q_u8(table, result.val[0]);
    result.val[1] = vqtbl4q_u8(table, result.val[1]);
    result.val[2] = vqtbl4q_u8(table, result.val[2]);
    result.val[3] = vqtbl4q_u8(table, result.val[3]);
    if (insert_line_feeds) {
      if (line_length >= 64) { // fast path
        vst4q_u8(out, result);
        if (offset + 64 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 64 - location_end;
          std::memmove(out + location_end + 1, out + location_end, to_move);
          out[location_end] = '\n';
          offset = to_move;
          out += 64 + 1;
        } else {
          offset += 64;
          out += 64;
        }
      } else { // slow path
        uint8x16x2_t Z0 = vzipq_u8(result.val[0], result.val[1]);
        uint8x16x2_t Z1 = vzipq_u8(result.val[2], result.val[3]);
        uint16x8x2_t Z2 = vzipq_u16(vreinterpretq_u16_u8(Z0.val[0]),
                                    vreinterpretq_u16_u8(Z1.val[0]));
        uint16x8x2_t Z3 = vzipq_u16(vreinterpretq_u16_u8(Z0.val[1]),
                                    vreinterpretq_u16_u8(Z1.val[1]));
        uint8x16_t T0 = vreinterpretq_u8_u16(Z2.val[0]);
        uint8x16_t T1 = vreinterpretq_u8_u16(Z2.val[1]);
        uint8x16_t T2 = vreinterpretq_u8_u16(Z3.val[0]);
        uint8x16_t T3 = vreinterpretq_u8_u16(Z3.val[1]);
        out += write_output_with_line_feeds(out, T0, line_length, offset);
        out += write_output_with_line_feeds(out, T1, line_length, offset);
        out += write_output_with_line_feeds(out, T2, line_length, offset);
        out += write_output_with_line_feeds(out, T3, line_length, offset);
      }
    } else {
      vst4q_u8(out, result);
      out += 64;
    }
  }

  if (i + 24 <= srclen) {
    const uint8x8_t v3f_d = vdup_n_u8(0x3f);
    const uint8x8x3_t in = vld3_u8((const uint8_t *)src + i);
    uint8x8x4_t result;
    result.val[0] = vshr_n_u8(in.val[0], 2);
    result.val[1] =
        vand_u8(vsli_n_u8(vshr_n_u8(in.val[1], 4), in.val[0], 4), v3f_d);
    result.val[2] =
        vand_u8(vsli_n_u8(vshr_n_u8(in.val[2], 6), in.val[1], 2), v3f_d);
    result.val[3] = vand_u8(in.val[2], v3f_d);
    result.val[0] = vqtbl4_u8(table, result.val[0]);
    result.val[1] = vqtbl4_u8(table, result.val[1]);
    result.val[2] = vqtbl4_u8(table, result.val[2]);
    result.val[3] = vqtbl4_u8(table, result.val[3]);
    if (insert_line_feeds) {
      if (line_length >= 32) { // fast path
        vst4_u8(out, result);
        if (offset + 32 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 32 - location_end;
          std::memmove(out + location_end + 1, out + location_end, to_move);
          out[location_end] = '\n';
          offset = to_move;
          out += 32 + 1;
        } else {
          offset += 32;
          out += 32;
        }
      } else { // slow path
        uint8x8x2_t Z0 = vzip_u8(result.val[0], result.val[1]);
        uint8x8x2_t Z1 = vzip_u8(result.val[2], result.val[3]);
        uint16x4x2_t Z2 = vzip_u16(vreinterpret_u16_u8(Z0.val[0]),
                                   vreinterpret_u16_u8(Z1.val[0]));
        uint16x4x2_t Z3 = vzip_u16(vreinterpret_u16_u8(Z0.val[1]),
                                   vreinterpret_u16_u8(Z1.val[1]));
        uint8x8_t T0 = vreinterpret_u8_u16(Z2.val[0]);
        uint8x8_t T1 = vreinterpret_u8_u16(Z2.val[1]);
        uint8x8_t T2 = vreinterpret_u8_u16(Z3.val[0]);
        uint8x8_t T3 = vreinterpret_u8_u16(Z3.val[1]);
        uint8x16_t TT0 = vcombine_u8(T0, T1);
        uint8x16_t TT1 = vcombine_u8(T2, T3);
        out += write_output_with_line_feeds(out, TT0, line_length, offset);
        out += write_output_with_line_feeds(out, TT1, line_length, offset);
      }
    } else {
      vst4_u8(out, result);
      out += 32;
    }
    i += 24;
  }
  out += scalar::base64::tail_encode_base64_impl<insert_line_feeds>(
      (char *)out, src + i, srclen - i, options, line_length, offset);
  return size_t((char *)out - dst);
}

size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  return encode_base64_impl<false>(dst, src, srclen, options);
}

static inline void compress(uint8x16_t data, uint16_t mask, char *output) {
  if (mask == 0) {
    vst1q_u8((uint8_t *)output, data);
    return;
  }
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1],
                               tables::base64::thintable_epi8[mask2]};
  uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t off =
      simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
#else
  const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#endif

  compactmask = vaddq_u8(compactmask, off);
  uint8x16_t pruned = vqtbl1q_u8(data, compactmask);

  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask, what it does is to write
  // only the first pop1 bytes from the first 8 bytes, and then
  // it fills in with the bytes from the second 8 bytes + some filling
  // at the end.
  compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8);
  uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
  vst1q_u8((uint8_t *)output, answer);
}

struct block64 {
  uint8x16_t chunks[4];
};

static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
template <bool base64_url, bool default_or_url>
uint64_t to_base64_mask(block64 *b, bool *error) {
  uint8x16_t v0f = vdupq_n_u8(0xf);
  uint8x16_t v01 = vdupq_n_u8(0x1);

  uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
  uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
  uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
  uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f);

  // Needed by the decoding step.
  uint8x16_t hi_bits0 = vshrq_n_u8(b->chunks[0], 3);
  uint8x16_t hi_bits1 = vshrq_n_u8(b->chunks[1], 3);
  uint8x16_t hi_bits2 = vshrq_n_u8(b->chunks[2], 3);
  uint8x16_t hi_bits3 = vshrq_n_u8(b->chunks[3], 3);
  uint8x16_t lut_lo;
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  if (default_or_url) {
    lut_lo =
        simdutf_make_uint8x16_t(0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
                                0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa5, 0xa0, 0xa6);
  } else if (base64_url) {
    lut_lo =
        simdutf_make_uint8x16_t(0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
                                0xf8, 0xf9, 0xf1, 0xa0, 0xa1, 0xa5, 0xa0, 0xa2);
  } else {
    lut_lo =
        simdutf_make_uint8x16_t(0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
                                0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa1, 0xa0, 0xa4);
  }
#else
  if (default_or_url) {
    lut_lo = uint8x16_t{0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
                        0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa5, 0xa0, 0xa6};
  } else if (base64_url) {
    lut_lo = uint8x16_t{0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
                        0xf8, 0xf9, 0xf1, 0xa0, 0xa1, 0xa5, 0xa0, 0xa2};
  } else {
    lut_lo = uint8x16_t{0xa9, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
                        0xf8, 0xf9, 0xf1, 0xa2, 0xa1, 0xa1, 0xa0, 0xa4};
  }
#endif
  uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
  uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
  uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
  uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
  uint8x16_t lut_hi;
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  if (default_or_url) {
    lut_hi =
        simdutf_make_uint8x16_t(0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, 0x10,
                                0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40);
  } else if (base64_url) {
    lut_hi =
        simdutf_make_uint8x16_t(0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, 0x10,
                                0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40);
  } else {
    lut_hi =
        simdutf_make_uint8x16_t(0x0, 0x1, 0x0, 0x0, 0x1, 0x6, 0x8, 0x8, 0x10,
                                0x20, 0x20, 0x10, 0x40, 0x80, 0x80, 0x40);
  }
#else
  if (default_or_url) {
    lut_hi = uint8x16_t{0x0,  0x1,  0x0,  0x0,  0x1,  0x6,  0x8,  0x8,
                        0x10, 0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40};
  } else if (base64_url) {
    lut_hi = uint8x16_t{0x0,  0x1,  0x0,  0x0,  0x1,  0x4,  0x8,  0x8,
                        0x10, 0x20, 0x20, 0x12, 0x40, 0x80, 0x80, 0x40};
  } else {
    lut_hi = uint8x16_t{0x0,  0x1,  0x0,  0x0,  0x1,  0x6,  0x8,  0x8,
                        0x10, 0x20, 0x20, 0x10, 0x40, 0x80, 0x80, 0x40};
  }
#endif
  uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_bits0);
  uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_bits1);
  uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_bits2);
  uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_bits3);

  // maps error byte to 0 and space byte to 1, valid bytes are >1
  uint8x16_t res0 = vandq_u8(lo0, hi0);
  uint8x16_t res1 = vandq_u8(lo1, hi1);
  uint8x16_t res2 = vandq_u8(lo2, hi2);
  uint8x16_t res3 = vandq_u8(lo3, hi3);

  uint8_t checks =
      vminvq_u8(vminq_u8(vminq_u8(res0, res1), vminq_u8(res2, res3)));
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t bit_mask =
      simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
#else
  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
#endif
  uint64_t badcharmask = 0;
  *error = checks == 0;
  if (checks <= 1) {
    // Add each of the elements next to each other, successively, to stuff each
    // 8 byte mask into one.
    uint8x16_t test0 = vcleq_u8(res0, v01);
    uint8x16_t test1 = vcleq_u8(res1, v01);
    uint8x16_t test2 = vcleq_u8(res2, v01);
    uint8x16_t test3 = vcleq_u8(res3, v01);
    uint8x16_t sum0 =
        vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask));
    uint8x16_t sum1 =
        vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask));
    sum0 = vpaddq_u8(sum0, sum1);
    sum0 = vpaddq_u8(sum0, sum0);
    badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
  }
  // This is the transformation step that can be done while we are waiting for
  // sum0
  uint8x16_t roll_lut;
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  if (default_or_url) {
    roll_lut =
        simdutf_make_uint8x16_t(0xBF, 0xE0, 0xB9, 0x13, 0x04, 0xBF, 0xBF, 0xB9,
                                0xB9, 0x00, 0xFF, 0x11, 0xFF, 0xBF, 0x10, 0xB9);
  } else if (base64_url) {
    roll_lut =
        simdutf_make_uint8x16_t(0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x11, 0xE0, 0x00,
                                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  } else {
    roll_lut =
        simdutf_make_uint8x16_t(0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x10, 0x13, 0x00,
                                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  }
#else
  if (default_or_url) {
    roll_lut = uint8x16_t{0xBF, 0xE0, 0xB9, 0x13, 0x04, 0xBF, 0xBF, 0xB9,
                          0xB9, 0x00, 0xFF, 0x11, 0xFF, 0xBF, 0x10, 0xB9};
  } else if (base64_url) {
    roll_lut = uint8x16_t{0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x11, 0xE0, 0x00,
                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  } else {
    roll_lut = uint8x16_t{0xB9, 0xB9, 0xBF, 0xBF, 0x04, 0x10, 0x13, 0x00,
                          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  }
#endif
  uint8x16_t roll0, roll1, roll2, roll3;
  if (default_or_url) {
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
    const uint8x16_t delta_asso =
        simdutf_make_uint8x16_t(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
#else
    const uint8x16_t delta_asso =
        uint8x16_t{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                   0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16};
#endif
    // the logic of translating is based on westmere
    uint8x16_t delta_hash0 =
        vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles0), hi_bits0);
    uint8x16_t delta_hash1 =
        vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles1), hi_bits1);
    uint8x16_t delta_hash2 =
        vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles2), hi_bits2);
    uint8x16_t delta_hash3 =
        vrhaddq_u8(vqtbl1q_u8(delta_asso, lo_nibbles3), hi_bits3);
    const uint8x16x2_t roll_lut_2 = {roll_lut, roll_lut};
    roll0 = vqtbl2q_u8(roll_lut_2, delta_hash0);
    roll1 = vqtbl2q_u8(roll_lut_2, delta_hash1);
    roll2 = vqtbl2q_u8(roll_lut_2, delta_hash2);
    roll3 = vqtbl2q_u8(roll_lut_2, delta_hash3);
  } else {
    uint8x16_t delta_hash0 = vclzq_u8(res0);
    uint8x16_t delta_hash1 = vclzq_u8(res1);
    uint8x16_t delta_hash2 = vclzq_u8(res2);
    uint8x16_t delta_hash3 = vclzq_u8(res3);
    roll0 = vqtbl1q_u8(roll_lut, delta_hash0);
    roll1 = vqtbl1q_u8(roll_lut, delta_hash1);
    roll2 = vqtbl1q_u8(roll_lut, delta_hash2);
    roll3 = vqtbl1q_u8(roll_lut, delta_hash3);
  }

  b->chunks[0] = vaddq_u8(b->chunks[0], roll0);
  b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
  b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
  b->chunks[3] = vaddq_u8(b->chunks[3], roll3);
  return badcharmask;
}

void copy_block(block64 *b, char *output) {
  vst1q_u8((uint8_t *)output, b->chunks[0]);
  vst1q_u8((uint8_t *)output + 16, b->chunks[1]);
  vst1q_u8((uint8_t *)output + 32, b->chunks[2]);
  vst1q_u8((uint8_t *)output + 48, b->chunks[3]);
}

uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t popcounts =
      vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
  uint64_t offsets = popcounts * 0x0101010101010101;
  compress(b->chunks[0], uint16_t(mask), output);
  compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]);
  compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]);
  compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]);
  return offsets >> 56;
}

// The caller of this function is responsible to ensure that there are 64 bytes
// available from reading at src. The data is read into a block64 structure.
void load_block(block64 *b, const char *src) {
  b->chunks[0] = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
  b->chunks[1] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 16);
  b->chunks[2] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 32);
  b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
}

// The caller of this function is responsible to ensure that there are 32 bytes
// available from reading at data. It returns a 16-byte value, narrowing with
// saturation the 16-bit words.
inline uint8x16_t load_satured(const uint16_t *data) {
  uint16x8_t in1 = vld1q_u16(data);
  uint16x8_t in2 = vld1q_u16(data + 8);
  return vqmovn_high_u16(vqmovn_u16(in1), in2);
}

// The caller of this function is responsible to ensure that there are 128 bytes
// available from reading at src. The data is read into a block64 structure.
void load_block(block64 *b, const char16_t *src) {
  b->chunks[0] = load_satured(reinterpret_cast<const uint16_t *>(src));
  b->chunks[1] = load_satured(reinterpret_cast<const uint16_t *>(src) + 16);
  b->chunks[2] = load_satured(reinterpret_cast<const uint16_t *>(src) + 32);
  b->chunks[3] = load_satured(reinterpret_cast<const uint16_t *>(src) + 48);
}

// decode 64 bytes and output 48 bytes
void base64_decode_block(char *out, const char *src) {
  uint8x16x4_t str = vld4q_u8((uint8_t *)src);
  uint8x16x3_t outvec;
  outvec.val[0] = vsliq_n_u8(vshrq_n_u8(str.val[1], 4), str.val[0], 2);
  outvec.val[1] = vsliq_n_u8(vshrq_n_u8(str.val[2], 2), str.val[1], 4);
  outvec.val[2] = vsliq_n_u8(str.val[3], str.val[2], 6);
  vst3q_u8((uint8_t *)out, outvec);
}

static size_t compress_block_single(block64 *b, uint64_t mask, char *output) {
  const size_t pos64 = trailing_zeroes(mask);
  const int8_t pos = pos64 & 0xf;

  // Predefine the index vector
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t v1 = simdutf_make_uint8x16_t(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                10, 11, 12, 13, 14, 15);
#else  // SIMDUTF_REGULAR_VISUAL_STUDIO
  const uint8x16_t v1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
#endif // SIMDUTF_REGULAR_VISUAL_STUDIO

  switch (pos64 >> 4) {
  case 0b00: {
    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1),
                 vreinterpretq_s8_u8(v0));  // Compare greater than
    const uint8x16_t sh = vsubq_u8(v1, v2); // Subtract
    const uint8x16_t compressed =
        vqtbl1q_u8(b->chunks[0], sh); // Table lookup (shuffle)

    vst1q_u8((uint8_t *)(output + 0 * 16), compressed);
    vst1q_u8((uint8_t *)(output + 1 * 16 - 1), b->chunks[1]);
    vst1q_u8((uint8_t *)(output + 2 * 16 - 1), b->chunks[2]);
    vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]);
  } break;

  case 0b01: {
    vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]);

    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0));
    const uint8x16_t sh = vsubq_u8(v1, v2);
    const uint8x16_t compressed = vqtbl1q_u8(b->chunks[1], sh);

    vst1q_u8((uint8_t *)(output + 1 * 16), compressed);
    vst1q_u8((uint8_t *)(output + 2 * 16 - 1), b->chunks[2]);
    vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]);
  } break;

  case 0b10: {
    vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]);
    vst1q_u8((uint8_t *)(output + 1 * 16), b->chunks[1]);

    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0));
    const uint8x16_t sh = vsubq_u8(v1, v2);
    const uint8x16_t compressed = vqtbl1q_u8(b->chunks[2], sh);

    vst1q_u8((uint8_t *)(output + 2 * 16), compressed);
    vst1q_u8((uint8_t *)(output + 3 * 16 - 1), b->chunks[3]);
  } break;

  case 0b11: {
    vst1q_u8((uint8_t *)(output + 0 * 16), b->chunks[0]);
    vst1q_u8((uint8_t *)(output + 1 * 16), b->chunks[1]);
    vst1q_u8((uint8_t *)(output + 2 * 16), b->chunks[2]);

    const uint8x16_t v0 = vmovq_n_u8((uint8_t)(pos - 1));
    const uint8x16_t v2 =
        vcgtq_s8(vreinterpretq_s8_u8(v1), vreinterpretq_s8_u8(v0));
    const uint8x16_t sh = vsubq_u8(v1, v2);
    const uint8x16_t compressed = vqtbl1q_u8(b->chunks[3], sh);

    vst1q_u8((uint8_t *)(output + 3 * 16), compressed);
  } break;
  }
  return 63;
}

template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }

template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename char_type>
full_result
compress_decode_base64(char *dst, const char_type *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t equalsigns = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, full_input_length, 0};
  }
  const char_type *const srcinit = src;
  const char *const dstinit = dst;
  const char_type *const srcend = src + srclen;

  constexpr size_t block_size = 10;
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const char_type *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      bool error = false;
      uint64_t badcharmask =
          to_base64_mask<base64_url, default_or_url>(&b, &error);
      if (badcharmask) {
        if (error && !ignore_garbage) {
          src -= 64;
          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
                 to_base64[uint8_t(*src)] <= 64) {
            src++;
          }
          if (src < srcend) {
            // should never happen
          }
          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                  size_t(dst - dstinit)};
        }
      }

      if (badcharmask != 0) {
        // optimization opportunity: check for simple masks like those made of
        // continuous 1s followed by continuous 0s. And masks containing a
        // single bad character.
        if (is_power_of_two(badcharmask)) {
          bufferptr += compress_block_single(&b, badcharmask, bufferptr);
        } else {
          bufferptr += compress_block(&b, badcharmask, bufferptr);
        }
      } else {
        // optimization opportunity: if bufferptr == buffer and mask == 0, we
        // can avoid the call to compress_block and decode directly.
        copy_block(&b, bufferptr);
        bufferptr += 64;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 1); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }
  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {
    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if ((!scalar::base64::is_eight_byte(*src) || val > 64) &&
          !ignore_garbage) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    base64_decode_block(dst, buffer_start);
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 4);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r = scalar::base64::patch_tail_result(
        r, size_t(src - srcinit), size_t(dst - dstinit), equallocation,
        full_input_length, last_chunk_options);
    // When is_partial(last_chunk_options) is true, we must either end with
    // the end of the stream (beyond whitespace) or right after a non-ignorable
    // character or at the very beginning of the stream.
    // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
    if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
        r.input_count < full_input_length) {
      // First check if we can extend the input to the end of the stream
      while (r.input_count < full_input_length &&
             base64_ignorable(*(srcinit + r.input_count), options)) {
        r.input_count++;
      }
      // If we are still not at the end of the stream, then we must backtrack
      // to the last non-ignorable character.
      if (r.input_count < full_input_length) {
        while (r.input_count > 0 &&
               base64_ignorable(*(srcinit + r.input_count - 1), options)) {
          r.input_count--;
        }
      }
    }
    return r;
  }
  if (equalsigns > 0 && !ignore_garbage) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}
/* end file src/arm64/arm_base64.cpp */
/* begin file src/arm64/arm_find.cpp */
simdutf_really_inline const char *util_find(const char *start, const char *end,
                                            char character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;

  const size_t widestep = 64;
  const size_t step = 16;
  uint8x16_t char_vec = vdupq_n_u8(static_cast<uint8_t>(character));

  // Handle unaligned beginning
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % step;
  if (misalignment != 0) {
    size_t adjustment = step - misalignment;
    if (size_t(end - start) < adjustment) {
      adjustment = end - start;
    }
    for (size_t i = 0; i < adjustment; ++i) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for full 64-byte chunks
  while (size_t(end - start) >= widestep) {
    uint8x16_t data1 = vld1q_u8(reinterpret_cast<const uint8_t *>(start));
    uint8x16_t data2 = vld1q_u8(reinterpret_cast<const uint8_t *>(start) + 16);
    uint8x16_t data3 = vld1q_u8(reinterpret_cast<const uint8_t *>(start) + 32);
    uint8x16_t data4 = vld1q_u8(reinterpret_cast<const uint8_t *>(start) + 48);

    uint8x16_t cmp1 = vceqq_u8(data1, char_vec);
    uint8x16_t cmp2 = vceqq_u8(data2, char_vec);
    uint8x16_t cmp3 = vceqq_u8(data3, char_vec);
    uint8x16_t cmp4 = vceqq_u8(data4, char_vec);
    uint8x16_t cmpall = vorrq_u8(vorrq_u8(cmp1, cmp2), vorrq_u8(cmp3, cmp4));

    uint64_t mask = vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmpall), 4)), 0);

    if (mask != 0) {
      // Found a match, return the first one
      uint64_t mask1 = vget_lane_u64(
          vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp1), 4)), 0);
      if (mask1 != 0) {
        // Found a match in the first chunk
        int index = trailing_zeroes(mask1) / 4; // Each character maps to 4 bits
        return start + index;
      }
      uint64_t mask2 = vget_lane_u64(
          vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp2), 4)), 0);
      if (mask2 != 0) {
        // Found a match in the second chunk
        int index = trailing_zeroes(mask2) / 4; // Each character maps to 4 bits
        return start + index + 16;
      }
      uint64_t mask3 = vget_lane_u64(
          vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp3), 4)), 0);
      if (mask3 != 0) {
        // Found a match in the third chunk
        int index = trailing_zeroes(mask3) / 4; // Each character maps to 4 bits
        return start + index + 32;
      }
      uint64_t mask4 = vget_lane_u64(
          vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp4), 4)), 0);
      if (mask4 != 0) {
        // Found a match in the fourth chunk
        int index = trailing_zeroes(mask4) / 4; // Each character maps to 4 bits
        return start + index + 48;
      }
    }

    start += widestep;
  }

  // Main loop for full 16-byte chunks
  while (size_t(end - start) >= step) {
    uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(start));
    uint8x16_t cmp = vceqq_u8(data, char_vec);
    uint64_t mask = vget_lane_u64(
        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4)), 0);

    if (mask != 0) {
      // Found a match, return the first one
      int index = trailing_zeroes(mask) / 4; // Each character maps to 4 bits
      return start + index;
    }

    start += step;
  }

  // Handle remaining bytes with scalar loop
  for (; start < end; ++start) {
    if (*start == character) {
      return start;
    }
  }

  return end;
}

simdutf_really_inline const char16_t *util_find(const char16_t *start,
                                                const char16_t *end,
                                                char16_t character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;

  const size_t step = 8;
  uint16x8_t char_vec = vdupq_n_u16(character);

  // Handle unaligned beginning
  uintptr_t misalignment =
      reinterpret_cast<uintptr_t>(start) % (step * sizeof(char16_t));
  if (misalignment != 0 && misalignment % 2 == 0) {
    size_t adjustment =
        (step * sizeof(char16_t) - misalignment) / sizeof(char16_t);
    if (size_t(end - start) < adjustment) {
      adjustment = end - start;
    }
    for (size_t i = 0; i < adjustment; ++i) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for full 8-element chunks with unrolling
  while (size_t(end - start) >= 4 * step) {
    uint16x8_t data1 = vld1q_u16(reinterpret_cast<const uint16_t *>(start));
    uint16x8_t data2 =
        vld1q_u16(reinterpret_cast<const uint16_t *>(start) + step);
    uint16x8_t data3 =
        vld1q_u16(reinterpret_cast<const uint16_t *>(start) + 2 * step);
    uint16x8_t data4 =
        vld1q_u16(reinterpret_cast<const uint16_t *>(start) + 3 * step);

    uint16x8_t cmp1 = vceqq_u16(data1, char_vec);
    uint16x8_t cmp2 = vceqq_u16(data2, char_vec);
    uint16x8_t cmp3 = vceqq_u16(data3, char_vec);
    uint16x8_t cmp4 = vceqq_u16(data4, char_vec);

    uint64_t mask1 = vget_lane_u64(
        vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp1), 4)), 0);
    if (mask1 != 0) {
      int index = trailing_zeroes(mask1) / 8;
      return start + index;
    }

    uint64_t mask2 = vget_lane_u64(
        vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp2), 4)), 0);
    if (mask2 != 0) {
      int index = trailing_zeroes(mask2) / 8;
      return start + index + step;
    }

    uint64_t mask3 = vget_lane_u64(
        vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp3), 4)), 0);
    if (mask3 != 0) {
      int index = trailing_zeroes(mask3) / 8;
      return start + index + 2 * step;
    }

    uint64_t mask4 = vget_lane_u64(
        vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp4), 4)), 0);
    if (mask4 != 0) {
      int index = trailing_zeroes(mask4) / 8;
      return start + index + 3 * step;
    }

    start += 4 * step;
  }

  // Main loop for full 8-element chunks
  while (size_t(end - start) >= step) {
    uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(start));
    uint16x8_t cmp = vceqq_u16(data, char_vec);
    uint64_t mask = vget_lane_u64(
        vreinterpret_u64_u16(vshrn_n_u32(vreinterpretq_u32_u16(cmp), 4)), 0);

    if (mask != 0) {
      int index = trailing_zeroes(mask) / 8;
      return start + index;
    }

    start += step;
  }

  // Handle remaining elements with scalar loop
  for (; start < end; ++start) {
    if (*start == character) {
      return start;
    }
  }

  return end;
}
/* end file src/arm64/arm_find.cpp */
/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
arm_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                            char *latin1_output) {
  const char32_t *end = buf + len;
  while (end - buf >= 8) {
    uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));
    if (vmaxvq_u16(utf16_packed) <= 0xff) {
      // 1. pack the bytes
      uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
      // 2. store (8 bytes)
      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                        char *latin1_output) {
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  while (end - buf >= 8) {
    uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));

    if (vmaxvq_u16(utf16_packed) <= 0xff) {
      // 1. pack the bytes
      uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
      // 2. store (8 bytes)
      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      // Let us do a scalar fallback.
      for (int k = 0; k < 8; k++) {
        uint32_t word = buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */
/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *end = buf + len;

  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);

  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (buf + 16 + safety_margin < end) {
    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    // Check if no bits set above 16th
    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
        // 2. store (8 bytes)
        vst1_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        continue; // we are done for this round!
      }

      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8
        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

        // t0 = [000a|aaaa|bbbb|bb00]
        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
        // t2 = [0000|0000|00bb|bbbb]
        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
        // t3 = [000a|aaaa|00bb|bbbb]
        const uint16x8_t t3 = vorrq_u16(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
        // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t mask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                                 0x0002, 0x0008, 0x0020, 0x0080};
#endif
        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
        // 4. pack the bytes
        const uint8_t *row =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
        const uint8x16_t shuffle = vld1q_u8(row + 1);
        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

        // 5. store bytes
        vst1q_u8(utf8_output, utf8_packed);

        // 6. adjust pointers
        buf += 8;
        utf8_output += row[0];
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
        forbidden_bytemask =
            vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff),
                                vcgeq_u16(utf16_packed, v_d800)),
                      forbidden_bytemask);

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
#else
        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
#endif
        /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
          single UFT-8 byte
          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
          two UTF-8 bytes
          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
          three UTF-8 bytes

          We expand the input word (16-bit) into two code units (32-bit), thus
          we have room for four bytes. However, we need five distinct bit
          layouts. Note that the last byte in cases #2 and #3 is the same.

          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
          in register t2.

          We precompute byte 1 for case #3 and -- **conditionally** --
          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
          they differ by exactly one bit.

          Finally from these two code units we build proper UTF-8 sequence,
          taking into account the case (i.e, the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        const uint16x8_t t0 =
            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
                                            vreinterpretq_u8_u16(dup_even)));
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        const uint16x8_t s1 =
            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
        // [00bb|bbbb|0000|aaaa]
        const uint16x8_t s2 = vorrq_u16(s0, s1s);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
        const uint16x8_t one_or_two_bytes_bytemask =
            vcleq_u16(utf16_packed, v_07ff);
        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
                                        one_or_two_bytes_bytemask);
        const uint16x8_t s4 = veorq_u16(s3, m0);
#undef simdutf_vec

        // 4. expand code units 16-bit => 32-bit
        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t onemask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
        const uint16x8_t twomask = simdutf_make_uint16x8_t(
            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
#else
        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
                                    0x0100, 0x0400, 0x1000, 0x4000};
        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
                                    0x0200, 0x0800, 0x2000, 0x8000};
#endif
        const uint16x8_t combined =
            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
                      vandq_u16(one_or_two_bytes_bytemask, twomask));
        const uint16_t mask = vaddvq_u16(combined);
        // The following fast path may or may not be beneficial.
        /*if(mask == 0) {
          // We only have three-byte code units. Use fast path.
          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
          vst1q_u8(utf8_output, utf8_0);
          utf8_output += 12;
          vst1q_u8(utf8_output, utf8_1);
          utf8_output += 12;
          buf += 8;
          continue;
        }*/
        const uint8_t mask0 = uint8_t(mask);
        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);

        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);

        vst1q_u8(utf8_output, utf8_0);
        utf8_output += row0[0];
        vst1q_u8(utf8_output, utf8_1);
        utf8_output += row1[0];

        buf += 8;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  if (vmaxvq_u16(forbidden_bytemask) != 0) {
    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
  }
  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
}

std::pair<result, char *>
arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                      char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (buf + 16 + safety_margin < end) {
    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));

    // Check if no bits set above 16th
    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
        // 2. store (8 bytes)
        vst1_u8(utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        continue; // we are done for this round!
      }

      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8
        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);

        // t0 = [000a|aaaa|bbbb|bb00]
        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
        // t2 = [0000|0000|00bb|bbbb]
        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
        // t3 = [000a|aaaa|00bb|bbbb]
        const uint16x8_t t3 = vorrq_u16(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
        // 3. prepare bitmask for 8-bit lookup
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t mask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
#else
        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
                                 0x0002, 0x0008, 0x0020, 0x0080};
#endif
        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
        // 4. pack the bytes
        const uint8_t *row =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
        const uint8x16_t shuffle = vld1q_u8(row + 1);
        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);

        // 5. store bytes
        vst1q_u8(utf8_output, utf8_packed);

        // 6. adjust pointers
        buf += 8;
        utf8_output += row[0];
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes

        // check for invalid input
        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
        const uint16x8_t forbidden_bytemask = vandq_u16(
            vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
        if (vmaxvq_u16(forbidden_bytemask) != 0) {
          return std::make_pair(result(error_code::SURROGATE, buf - start),
                                reinterpret_cast<char *>(utf8_output));
        }

#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
#else
        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
#endif
        /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
          single UFT-8 byte
          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
          two UTF-8 bytes
          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
          three UTF-8 bytes

          We expand the input word (16-bit) into two code units (32-bit), thus
          we have room for four bytes. However, we need five distinct bit
          layouts. Note that the last byte in cases #2 and #3 is the same.

          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
          in register t2.

          We precompute byte 1 for case #3 and -- **conditionally** --
          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
          they differ by exactly one bit.

          Finally from these two code units we build proper UTF-8 sequence,
          taking into account the case (i.e, the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        const uint16x8_t t0 =
            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
                                            vreinterpretq_u8_u16(dup_even)));
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        const uint16x8_t s1 =
            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
        // [00bb|bbbb|0000|aaaa]
        const uint16x8_t s2 = vorrq_u16(s0, s1s);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
        const uint16x8_t one_or_two_bytes_bytemask =
            vcleq_u16(utf16_packed, v_07ff);
        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
                                        one_or_two_bytes_bytemask);
        const uint16x8_t s4 = veorq_u16(s3, m0);
#undef simdutf_vec

        // 4. expand code units 16-bit => 32-bit
        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
        const uint16x8_t onemask = simdutf_make_uint16x8_t(
            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
        const uint16x8_t twomask = simdutf_make_uint16x8_t(
            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
#else
        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
                                    0x0100, 0x0400, 0x1000, 0x4000};
        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
                                    0x0200, 0x0800, 0x2000, 0x8000};
#endif
        const uint16x8_t combined =
            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
                      vandq_u16(one_or_two_bytes_bytemask, twomask));
        const uint16_t mask = vaddvq_u16(combined);
        // The following fast path may or may not be beneficial.
        /*if(mask == 0) {
          // We only have three-byte code units. Use fast path.
          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
          vst1q_u8(utf8_output, utf8_0);
          utf8_output += 12;
          vst1q_u8(utf8_output, utf8_1);
          utf8_output += 12;
          buf += 8;
          continue;
        }*/
        const uint8_t mask0 = uint8_t(mask);

        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);

        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);

        vst1q_u8(utf8_output, utf8_0);
        utf8_output += row0[0];
        vst1q_u8(utf8_output, utf8_1);
        utf8_output += row1[0];

        buf += 8;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char *>(utf8_output));
}
/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */

} // unnamed namespace
} // namespace arm64
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace arm64 {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0. In
   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        0b11110000u - 1,
                                        0b11100000u - 1,
                                        0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */

/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace ascii_validation {

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      return false;
    }
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  return in.is_ascii();
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/ascii_validation.h */
  // transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two-complement's, so largest possible continuation
      // byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
// other functions
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8.h */
  // transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then we have that margin-1 is the eight last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 inputs bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace arm64
} // namespace simdutf
  // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
/* begin file src/generic/base64lengths.h */
namespace simdutf {
namespace arm64 {
namespace {
namespace base64_lengths {

simdutf_warn_unused size_t binary_length_from_base64(const char *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= length; pos += 64) {
    simd8x64<uint8_t> block(reinterpret_cast<const uint8_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

simdutf_warn_unused size_t binary_length_from_base64(const char16_t *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 32 <= length; pos += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char16_t c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

} // namespace base64_lengths
} // unnamed namespace
} // namespace arm64
} // namespace simdutf
/* end file src/generic/base64lengths.h */

//
// Implementation-specific overrides
//
namespace simdutf {
namespace arm64 {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return arm64::utf8_validation::generic_validate_utf8(buf, len);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return arm64::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return arm64::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // empty input is valid. protected the implementation from nullptr.
    return true;
  }
  const char32_t *tail = arm_validate_utf32le(buf, len);
  if (tail) {
    return scalar::utf32::validate(tail, len - (tail - buf));
  } else {
    return false;
  }
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = arm_validate_utf32le_with_errors(buf, len);
  if (res.count != len) {
    result scalar_res =
        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      arm_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      arm_convert_latin1_to_utf32(buf, len, utf32_output);
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return 0;
  }
  std::pair<const char32_t *, char *> ret =
      arm_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      arm_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      arm_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
        ret.first, len - (ret.first - buf), ret.second);
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf32_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  // See
  // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/
  // credit to Pete Cawley
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  uint64_t result = 0;
  const int lanes = sizeof(uint8x16_t);
  uint8_t rem = length % lanes;
  const uint8_t *simd_end = data + (length / lanes) * lanes;
  const uint8x16_t threshold = vdupq_n_u8(0x80);
  for (; data < simd_end; data += lanes) {
    // load 16 bytes
    uint8x16_t input_vec = vld1q_u8(data);
    // compare to threshold (0x80)
    uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold);
    // vertical addition
    result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit));
  }
  return result + (length / lanes) * lanes +
         scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
  const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 4 <= length; pos += 4) {
    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
    const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
    const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
    const uint32x4_t two_bytes_bytemask =
        veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
    const uint32x4_t three_bytes_bytemask =
        veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);

    const uint16x8_t reduced_ascii_bytes_bytemask =
        vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
    const uint16x8_t reduced_two_bytes_bytemask =
        vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
    const uint16x8_t reduced_three_bytes_bytemask =
        vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));

    const uint16x8_t compressed_bytemask0 =
        vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
    const uint16x8_t compressed_bytemask1 =
        vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);

    size_t ascii_count = count_ones(
        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
    size_t two_bytes_count = count_ones(
        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
    size_t three_bytes_count = count_ones(
        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));

    count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
  }
  return count +
         scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  return encode_base64(output, input, length, options);
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  return encode_base64_impl<true>(output, input, length, options, line_length);
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return util_find(start, end, character);
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return util_find(start, end, character);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

} // namespace arm64
} // namespace simdutf

/* begin file src/simdutf/arm64/end.h */
/* end file src/simdutf/arm64/end.h */
/* end file src/arm64/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_FALLBACK
/* begin file src/fallback/implementation.cpp */
/* begin file src/simdutf/fallback/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "fallback"
// #define SIMDUTF_IMPLEMENTATION fallback
/* end file src/simdutf/fallback/begin.h */

namespace simdutf {
namespace fallback {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return scalar::utf8::validate(buf, len);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return scalar::utf8::validate_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return scalar::ascii::validate(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return scalar::ascii::validate_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  return scalar::utf32::validate(buf, len);
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  return scalar::utf32::validate_with_errors(buf, len);
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return scalar::utf8::count_code_points(input, length);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return scalar::utf8::count_code_points(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  return scalar::latin1_to_utf8::utf8_length_from_latin1(input, length);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return scalar::utf32::utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return scalar::utf8::count_code_points(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64(output, input, length, options);
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64_impl<true>(output, input, length,
                                                       options, line_length);
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return std::find(start, end, character);
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return std::find(start, end, character);
}

} // namespace fallback
} // namespace simdutf

/* begin file src/simdutf/fallback/end.h */
/* end file src/simdutf/fallback/end.h */
/* end file src/fallback/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_ICELAKE
/* begin file src/icelake/implementation.cpp */
#include <tuple>
#include <utility>

/* begin file src/simdutf/icelake/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "icelake"
// #define SIMDUTF_IMPLEMENTATION icelake

#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
// nothing needed.
#else
SIMDUTF_TARGET_ICELAKE
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
// clang-format off
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
// clang-format on
#endif // end of workaround
/* end file src/simdutf/icelake/begin.h */
namespace simdutf {
namespace icelake {
namespace {
#ifndef SIMDUTF_ICELAKE_H
  #error "icelake.h must be included"
#endif
using namespace simd;

/* begin file src/icelake/icelake_macros.inl.cpp */
/*
    This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a
   UTF-8 string) and loads all possible 4-byte substring into an AVX512
   register.

    For example if we have bytes abcdefgh... we create following 32-bit lanes

    [abcd|bcde|cdef|defg|efgh|...]
     ^                          ^
     byte 0 of reg              byte 63 of reg
*/
/** pshufb
        # lane{0,1,2} have got bytes: [  0,  1,  2,  3,  4,  5,  6,  8,  9, 10,
   11, 12, 13, 14, 15] # lane3 has got bytes:        [ 16, 17, 18, 19,  4,  5,
   6,  8,  9, 10, 11, 12, 13, 14, 15]

        expand_ver2 = [
            # lane 0:
            0, 1, 2, 3,
            1, 2, 3, 4,
            2, 3, 4, 5,
            3, 4, 5, 6,

            # lane 1:
            4, 5, 6, 7,
            5, 6, 7, 8,
            6, 7, 8, 9,
            7, 8, 9, 10,

            # lane 2:
             8,  9, 10, 11,
             9, 10, 11, 12,
            10, 11, 12, 13,
            11, 12, 13, 14,

            # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16,
   17, 18, 19 12, 13, 14, 15, 13, 14, 15,  0, 14, 15,  0,  1, 15,  0,  1,  2,
        ]
*/

#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                      \
  {                                                                            \
    const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);        \
    const __m512i expand_ver2 = _mm512_setr_epi64(                             \
        0x0403020103020100, 0x0605040305040302, 0x0807060507060504,            \
        0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,            \
        0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);                               \
    const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);            \
                                                                               \
    __mmask16 leading_bytes;                                                   \
    const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                       \
    const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                   \
    const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                       \
    leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                 \
                                                                               \
    __m512i char_class;                                                        \
    char_class = _mm512_srli_epi32(input, 4);                                  \
    /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                     \
    const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                       \
    const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                 \
    char_class =                                                               \
        _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \
                                                                               \
    const int valid_count = static_cast<int>(count_ones(leading_bytes));       \
    const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);           \
                                                                               \
    const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(),     \
                                                   leading_bytes, utf32);      \
                                                                               \
    if (UTF32) {                                                               \
      if (MASKED) {                                                            \
        const __mmask16 valid = uint16_t((1 << valid_count) - 1);              \
        _mm512_mask_storeu_epi32((__m512i *)output, valid, out);               \
      } else {                                                                 \
        _mm512_storeu_si512((__m512i *)output, out);                           \
      }                                                                        \
      output += valid_count;                                                   \
    } else {                                                                   \
      if (MASKED) {                                                            \
        output += utf32_to_utf16_masked<big_endian>(                           \
            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
      } else {                                                                 \
        output += utf32_to_utf16<big_endian>(                                  \
            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
      }                                                                        \
    }                                                                          \
  }

#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)       \
  {                                                                            \
    if (UTF32) {                                                               \
      if (MASKED) {                                                            \
        const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);         \
        _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT);        \
      } else {                                                                 \
        _mm512_storeu_si512((__m512i *)output, INPUT);                         \
      }                                                                        \
      output += VALID_COUNT;                                                   \
    } else {                                                                   \
      if (MASKED) {                                                            \
        output += utf32_to_utf16_masked<big_endian>(                           \
            byteflip, INPUT, VALID_COUNT,                                      \
            reinterpret_cast<char16_t *>(output));                             \
      } else {                                                                 \
        output +=                                                              \
            utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT,           \
                                       reinterpret_cast<char16_t *>(output));  \
      }                                                                        \
    }                                                                          \
  }

#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                       \
  if (UTF32) {                                                                 \
    const __m128i t0 = _mm512_castsi512_si128(utf8);                           \
    const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                     \
    const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                     \
    const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                     \
    _mm512_storeu_si512((__m512i *)(output + 0 * 16),                          \
                        _mm512_cvtepu8_epi32(t0));                             \
    _mm512_storeu_si512((__m512i *)(output + 1 * 16),                          \
                        _mm512_cvtepu8_epi32(t1));                             \
    _mm512_storeu_si512((__m512i *)(output + 2 * 16),                          \
                        _mm512_cvtepu8_epi32(t2));                             \
    _mm512_storeu_si512((__m512i *)(output + 3 * 16),                          \
                        _mm512_cvtepu8_epi32(t3));                             \
  } else {                                                                     \
    const __m256i h0 = _mm512_castsi512_si256(utf8);                           \
    const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                     \
    if (big_endian) {                                                          \
      _mm512_storeu_si512(                                                     \
          (__m512i *)(output + 0 * 16),                                        \
          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip));            \
      _mm512_storeu_si512(                                                     \
          (__m512i *)(output + 2 * 16),                                        \
          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip));            \
    } else {                                                                   \
      _mm512_storeu_si512((__m512i *)(output + 0 * 16),                        \
                          _mm512_cvtepu8_epi16(h0));                           \
      _mm512_storeu_si512((__m512i *)(output + 2 * 16),                        \
                          _mm512_cvtepu8_epi16(h1));                           \
    }                                                                          \
  }
/* end file src/icelake/icelake_macros.inl.cpp */
/* begin file src/icelake/icelake_common.inl.cpp */
// file included directly
/**
 * Store the last N bytes of previous followed by 512-N bytes from input.
 */
template <int N> __m512i prev(__m512i input, __m512i previous) {
  static_assert(N <= 32, "N must be no larger than 32");
  const __m512i movemask =
      _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
  const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
#if SIMDUTF_GCC8 || SIMDUTF_GCC9
  constexpr int shift = 16 - N; // workaround for GCC8,9
  return _mm512_alignr_epi8(input, rotated, shift);
#else
  return _mm512_alignr_epi8(input, rotated, 16 - N);
#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
}

template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
__m512i shuffle_epi128(__m512i v) {
  static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3");
  static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
  static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
  static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3");

  constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6);
  return _mm512_shuffle_i32x4(v, v, shuffle);
}

template <unsigned idx> constexpr __m512i broadcast_epi128(__m512i v) {
  return shuffle_epi128<idx, idx, idx, idx>(v);
}

simdutf_really_inline __m512i broadcast_128bit_lane(__m128i lane) {
  const __m512i tmp = _mm512_castsi128_si512(lane);

  return broadcast_epi128<0>(tmp);
}
/* end file src/icelake/icelake_common.inl.cpp */
/* begin file src/icelake/icelake_utf8_common.inl.cpp */
// Common procedures for both validating and non-validating conversions from
// UTF-8.
enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL };

using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
using utf8_to_utf32_result = std::pair<const char *, uint32_t *>;

/*
    process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
    to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
    might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
    indicates how many input bytes are relevant.

    Returns true when the result is correct, otherwise it returns false.

    The provided in and out pointers are advanced according to how many input
    bytes have been processed, upon success.
*/
template <block_processing_mode tail, endianness big_endian>
simdutf_really_inline bool
process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
  // constants
  __m512i mask_identity = _mm512_set_epi8(
      63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46,
      45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28,
      27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9,
      8, 7, 6, 5, 4, 3, 2, 1, 0);
  __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
  __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
  __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
  __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(
      0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
  __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
  __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
  __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
  __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  // Note that 'tail' is a compile-time constant !
  __mmask64 b =
      (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
  __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in)
                                         : _mm512_maskz_loadu_epi8(b, in);
  __mmask64 m1 = (tail == SIMDUTF_FULL)
                     ? _mm512_cmplt_epu8_mask(input, mask_80808080)
                     : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
  if (_ktestc_mask64_u8(m1,
                        b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
                              // alternatively, we could do 'if (m1 == b) { '
    if (tail == SIMDUTF_FULL) {
      in += 64; // consumed 64 bytes
      // we convert a full 64-byte block, writing 128 bytes.
      __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
      if (big_endian) {
        input1 = _mm512_shuffle_epi8(input1, byteflip);
      }
      _mm512_storeu_si512(out, input1);
      out += 32;
      __m512i input2 =
          _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
      if (big_endian) {
        input2 = _mm512_shuffle_epi8(input2, byteflip);
      }
      _mm512_storeu_si512(out, input2);
      out += 32;
      return true; // we are done
    } else {
      in += gap;
      if (gap <= 32) {
        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
        if (big_endian) {
          input1 = _mm512_shuffle_epi8(input1, byteflip);
        }
        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1),
                                 input1);
        out += gap;
      } else {
        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
        if (big_endian) {
          input1 = _mm512_shuffle_epi8(input1, byteflip);
        }
        _mm512_storeu_si512(out, input1);
        out += 32;
        __m512i input2 =
            _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
        if (big_endian) {
          input2 = _mm512_shuffle_epi8(input2, byteflip);
        }
        _mm512_mask_storeu_epi16(
            out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
        out += gap - 32;
      }
      return true; // we are done
    }
  }
  // classify characters further
  __mmask64 m234 = _mm512_cmp_epu8_mask(
      mask_c0c0c0c0, input,
      _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
  __mmask64 m34 =
      _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
                           _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte

  __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(
      m234, input, mask_c2c2c2c2,
      _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
                      // Overlong 2-byte sequence
  if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
    // Overlong 2-byte sequence
    return false;
  }
  if (_ktestz_mask64_u8(m34, m34) == 0) {
    // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a
    // 4-byte sequence!
    __mmask64 m4 = _mm512_cmp_epu8_mask(
        input, mask_f0f0f0f0,
        _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)

    __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL)
                                   ? _knot_mask64(m1)
                                   : _kand_mask64(_knot_mask64(m1), b);

    __mmask64 mp1 = _kshiftli_mask64(m234, 1);
    __mmask64 mp2 = _kshiftli_mask64(m34, 2);
    // We could do it as follows...
    // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit
    // masks a and b and return 1 if all zeroes but GCC generates better code
    // when we do:
    if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and
                   // return 1 if all zeroes
      // Fast path with 1,2,3 bytes
      __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
      __mmask64 m1234 = _kor_mask64(m1, m234);
      // mismatched continuation bytes:
      if (tail == SIMDUTF_FULL) {
        __mmask64 xnormcm1234 = _kxnor_mask64(
            mc,
            m1234); // XNOR of mc and m1234 should be all zero if they differ
        // the presence of a 1 bit indicates that they overlap.
        // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return
        // 1 if all zeroes.
        if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
          return false;
        }
      } else {
        __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
        if (mc != bxorm1234) {
          return false;
        }
      }
      // mend: identifying the last bytes of each sequence to be decoded
      __mmask64 mend = _kshiftri_mask64(m1234, 1);
      if (tail != SIMDUTF_FULL) {
        mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
      }

      __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
      __m512i last_and_thirdu16 =
          _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));

      __m512i nonasciitags = _mm512_maskz_mov_epi8(
          mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
      __m512i clearedbytes = _mm512_andnot_si512(
          nonasciitags, input); // high two bits cleared where not ASCII
      __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
          0x5555555555555555, last_and_thirdu16,
          clearedbytes); // the last byte of each character

      __mmask64 mask_before_non_ascii = _kshiftri_mask64(
          mask_not_ascii, 1); // bytes that precede non-ASCII bytes
      __m512i indexofsecondlastbytes = _mm512_add_epi16(
          mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
      __m512i beforeasciibytes =
          _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
      __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
          0x5555555555555555, indexofsecondlastbytes,
          beforeasciibytes); // the second last bytes (of two, three byte seq,
                             // surrogates)
      secondlastbytes =
          _mm512_slli_epi16(secondlastbytes, 6); // shifted into position

      __m512i indexofthirdlastbytes = _mm512_add_epi16(
          mask_ffffffff,
          indexofsecondlastbytes); // indices of the second last bytes
      __m512i thirdlastbyte =
          _mm512_maskz_mov_epi8(m34,
                                clearedbytes); // only those that are the third
                                               // last byte of a sequence
      __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
          0x5555555555555555, indexofthirdlastbytes,
          thirdlastbyte); // the third last bytes (of three byte sequences, hi
                          // surrogate)
      thirdlastbytes =
          _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
      __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes,
                                               thirdlastbytes, 254);
      // the elements of Wout excluding the last element if it happens to be a
      // high surrogate:

      __mmask64 mprocessed =
          (tail == SIMDUTF_FULL)
              ? _pdep_u64(0xFFFFFFFF, mend)
              : _pdep_u64(
                    0xFFFFFFFF,
                    _kand_mask64(
                        mend, b)); // we adjust mend at the end of the output.

      // Encodings out of range...
      {
        // the location of 3-byte sequence start bytes in the input
        __mmask64 m3 = m34 & (b ^ m4);
        // code units in Wout corresponding to 3-byte sequences.
        __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
        __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
        __mmask32 Msmall800 =
            _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
        __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
        __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
        __mmask32 M3s =
            _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
        if (_kor_mask32(Msmall800, M3s)) {
          return false;
        }
      }
      int64_t nout = _mm_popcnt_u64(mprocessed);
      in += 64 - _lzcnt_u64(mprocessed);
      if (big_endian) {
        Wout = _mm512_shuffle_epi8(Wout, byteflip);
      }
      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
      out += nout;
      return true; // ok
    }
    //
    // We have a 4-byte sequence, this is the general case.
    // Slow!
    __mmask64 mp3 = _kshiftli_mask64(m4, 3);
    __mmask64 mc =
        _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
    __mmask64 m1234 = _kor_mask64(m1, m234);

    // mend: identifying the last bytes of each sequence to be decoded
    __mmask64 mend =
        _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
    if (tail != SIMDUTF_FULL) {
      mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
    }
    __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
    __m512i last_and_thirdu16 =
        _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));

    __m512i nonasciitags = _mm512_maskz_mov_epi8(
        mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
    __m512i clearedbytes = _mm512_andnot_si512(
        nonasciitags, input); // high two bits cleared where not ASCII
    __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
        0x5555555555555555, last_and_thirdu16,
        clearedbytes); // the last byte of each character

    __mmask64 mask_before_non_ascii = _kshiftri_mask64(
        mask_not_ascii, 1); // bytes that precede non-ASCII bytes
    __m512i indexofsecondlastbytes = _mm512_add_epi16(
        mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
    __m512i beforeasciibytes =
        _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
    __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
        0x5555555555555555, indexofsecondlastbytes,
        beforeasciibytes); // the second last bytes (of two, three byte seq,
                           // surrogates)
    secondlastbytes =
        _mm512_slli_epi16(secondlastbytes, 6); // shifted into position

    __m512i indexofthirdlastbytes = _mm512_add_epi16(
        mask_ffffffff,
        indexofsecondlastbytes); // indices of the second last bytes
    __m512i thirdlastbyte = _mm512_maskz_mov_epi8(
        m34,
        clearedbytes); // only those that are the third last byte of a sequence
    __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
        0x5555555555555555, indexofthirdlastbytes,
        thirdlastbyte); // the third last bytes (of three byte sequences, hi
                        // surrogate)
    thirdlastbytes =
        _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
    __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(
        lastbytes, secondlastbytes, thirdlastbytes, 254);
    uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
    __mmask32 Mlo = __mmask32(Mlo_uint64);
    __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
    __m512i lo_surr_mask = _mm512_maskz_mov_epi16(
        Mlo,
        mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
    __m512i shifted4_thirdsecondandlastbytes =
        _mm512_srli_epi16(thirdsecondandlastbytes,
                          4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
    __m512i tagged_lo_surrogates = _mm512_or_si512(
        thirdsecondandlastbytes,
        lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
    __m512i Wout = _mm512_mask_add_epi16(
        tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
        mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
    // the elements of Wout excluding the last element if it happens to be a
    // high surrogate:
    __mmask32 Mout = ~(Mhi & 0x80000000);
    __mmask64 mprocessed =
        (tail == SIMDUTF_FULL)
            ? _pdep_u64(Mout, mend)
            : _pdep_u64(
                  Mout,
                  _kand_mask64(mend,
                               b)); // we adjust mend at the end of the output.

    // mismatched continuation bytes:
    if (tail == SIMDUTF_FULL) {
      __mmask64 xnormcm1234 = _kxnor_mask64(
          mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
      // the presence of a 1 bit indicates that they overlap.
      // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1
      // if all zeroes.
      if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
        return false;
      }
    } else {
      __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
      if (mc != bxorm1234) {
        return false;
      }
    }
    // Encodings out of range...
    {
      // the location of 3-byte sequence start bytes in the input
      __mmask64 m3 = m34 & (b ^ m4);
      // code units in Wout corresponding to 3-byte sequences.
      __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
      __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
      __mmask32 Msmall800 =
          _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
      __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
      __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
      __mmask32 M3s =
          _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
      __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
      __mmask32 M4s =
          _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
      if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
        return false;
      }
    }
    in += 64 - _lzcnt_u64(mprocessed);
    int64_t nout = _mm_popcnt_u64(mprocessed);
    if (big_endian) {
      Wout = _mm512_shuffle_epi8(Wout, byteflip);
    }
    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
    out += nout;
    return true; // ok
  }
  // Fast path 2: all ASCII or 2 byte
  __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL)
                                        ? _knot_mask64(m234)
                                        : _kand_mask64(_knot_mask64(m234), b);
  // on top of -0xc0 we subtract -2 which we get back later of the
  // continuation byte tags
  __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
  __mmask64 leading = tail == (tail == SIMDUTF_FULL)
                          ? _kor_mask64(m1, m234)
                          : _kand_mask64(_kor_mask64(m1, m234),
                                         b); // first bytes of each sequence
  if (tail == SIMDUTF_FULL) {
    __mmask64 xnor234leading =
        _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
    if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
      return false;
    }
  } else {
    __mmask64 bxorleading = _kxor_mask64(b, leading);
    if (_kshiftli_mask64(m234, 1) != bxorleading) {
      return false;
    }
  }
  //
  if (tail == SIMDUTF_FULL) {
    // In the two-byte/ASCII scenario, we are easily latency bound, so we want
    // to increment the input buffer as quickly as possible.
    // We process 32 bytes unless the byte at index 32 is a continuation byte,
    // in which case we include it as well for a total of 33 bytes.
    // Note that if x is an ASCII byte, then the following is false:
    // int8_t(x) <= int8_t(0xc0) under two's complement.
    in += 32;
    if (int8_t(*in) <= int8_t(0xc0))
      in++;
    // The alternative is to do
    // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
    // but it requires loading the input, doing the mask computation, and
    // converting back the mask to a general register. It just takes too long,
    // leaving the processor likely to be idle.
  } else {
    in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
  }
  __m512i lead = _mm512_maskz_compress_epi8(
      leading, leading2byte); // will contain zero for ascii, and the data
  lead = _mm512_cvtepu8_epi16(
      _mm512_castsi512_si256(lead)); // ... zero extended into code units
  __m512i follow = _mm512_maskz_compress_epi8(
      continuation_or_ascii, input); // the last bytes of each sequence
  follow = _mm512_cvtepu8_epi16(
      _mm512_castsi512_si256(follow)); // ... zero extended into code units
  lead = _mm512_slli_epi16(lead, 6);   // shifted into position
  __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow

  if (big_endian) {
    final = _mm512_shuffle_epi8(final, byteflip);
  }
  if (tail == SIMDUTF_FULL) {
    // Next part is UTF-16 specific and can be generalized to UTF-32.
    int nout = _mm_popcnt_u32(uint32_t(leading));
    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
  } else {
    int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
  }

  return true; // we are fine.
}

/*
    utf32_to_utf16_masked converts `count` lower UTF-32 code units
    from input `utf32` into UTF-16. It differs from utf32_to_utf16
    in that it 'masks' the writes.

    Returns how many 16-bit code units were stored.

    byteflip is used for flipping 16-bit code units, and it should be
        __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
        );
    We pass it to the (always inlined) function to encourage the compiler to
    keep the value in a (constant) register.
*/
template <endianness big_endian>
simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip,
                                                   __m512i utf32,
                                                   unsigned int count,
                                                   char16_t *output) {

  const __mmask16 valid = uint16_t((1 << count) - 1);
  // 1. check if we have any surrogate pairs
  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
  const __mmask16 sp_mask =
      _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);

  if (sp_mask == 0) {
    if (big_endian) {
      _mm256_mask_storeu_epi16(
          (__m256i *)output, valid,
          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
                              _mm512_castsi512_si256(byteflip)));

    } else {
      _mm256_mask_storeu_epi16((__m256i *)output, valid,
                               _mm512_cvtepi32_epi16(utf32));
    }
    return count;
  }

  {
    // build surrogate pair code units in 32-bit lanes

    //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);

    //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
    const __m512i t1 = _mm512_slli_epi32(t0, 6);

    //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1
    //    to t0
    //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);

    //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1
    //    to t0
    //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
    const __m512i t3 =
        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
    __m512i t5 = _mm512_ror_epi32(t4, 16);
    // Here we want to trim all of the upper 16-bit code units from the 2-byte
    // characters represented as 4-byte values. We can compute it from
    // sp_mask or the following... It can be more optimized!
    const __mmask32 nonzero = _kor_mask32(
        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
    const __mmask32 nonzero_masked =
        _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
    if (big_endian) {
      t5 = _mm512_shuffle_epi8(t5, byteflip);
    }
    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
    // (AMD Zen4 has terrible performance with it, it is effectively broken)
    __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
    _mm512_mask_storeu_epi16(
        output, _bzhi_u32(0xFFFFFFFF, count + _mm_popcnt_u32(sp_mask)),
        compressed);
    //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
  }

  return count + static_cast<unsigned int>(count_ones(sp_mask));
}

/*
    utf32_to_utf16 converts `count` lower UTF-32 code units
    from input `utf32` into UTF-16. It may overflow.

    Returns how many 16-bit code units were stored.

    byteflip is used for flipping 16-bit code units, and it should be
        __m512i byteflip = _mm512_setr_epi64(
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809,
            0x0607040502030001,
            0x0e0f0c0d0a0b0809
        );
    We pass it to the (always inlined) function to encourage the compiler to
    keep the value in a (constant) register.
*/
template <endianness big_endian>
simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip,
                                            __m512i utf32, unsigned int count,
                                            char16_t *output) {
  // check if we have any surrogate pairs
  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
  const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);

  if (sp_mask == 0) {
    // technically, it should be _mm256_storeu_epi16
    if (big_endian) {
      _mm256_storeu_si256(
          (__m256i *)output,
          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
                              _mm512_castsi512_si256(byteflip)));
    } else {
      _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32));
    }
    return count;
  }

  {
    // build surrogate pair code units in 32-bit lanes

    //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);

    //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
    const __m512i t1 = _mm512_slli_epi32(t0, 6);

    //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1
    //    to t0
    //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);

    //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1
    //    to t0
    //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
    const __m512i t3 =
        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
    __m512i t5 = _mm512_ror_epi32(t4, 16);
    const __mmask32 nonzero = _kor_mask32(
        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
    if (big_endian) {
      t5 = _mm512_shuffle_epi8(t5, byteflip);
    }
    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
    // (zen4)
    __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
    _mm512_mask_storeu_epi16(
        output,
        (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1,
        compressed);
    //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
  }

  return count + static_cast<unsigned int>(count_ones(sp_mask));
}

/*
    expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
    stored at separate 32-bit lanes.

    For each lane we have also a character class (`char_class), given in form
    0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets
    corresponding bytes during pshufb.
*/
simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class,
                                                     __m512i utf8) {
  /*
      Input:
      - utf8: bytes stored at separate 32-bit code units
      - valid: which code units have valid UTF-8 characters

      Bit layout of single word. We show 4 cases for each possible
      UTF-8 character encoding. The `?` denotes bits we must not
      assume their value.

      |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
      |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
      |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
      |????.????|????.????|????.????|0aaa.aaaa| ASCII char
        byte 3    byte 2    byte 1     byte 0
  */

  /* 1. Reset control bits of continuation bytes and the MSB
        of the leading byte; this makes all bytes unsigned (and
        does not alter ASCII char).

      |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
      |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
      |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
      |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
       ^^        ^^        ^^        ^
  */
  __m512i values;
  const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
  values = _mm512_and_si512(utf8, v_3f3f_3f7f);

  /* 2. Swap and join fields A-B and C-D

      |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
      |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
      |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
      |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
  const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
  values = _mm512_maddubs_epi16(values, v_0140_0140);

  /* 3. Swap and join fields AB & CD

      |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
      |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
      |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
      |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
  const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
  values = _mm512_madd_epi16(values, v_0001_1000);

  /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
      |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
      |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
      |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
      |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
  {
    /** pshufb

    continuation = 0
    ascii    = 7
    _2_bytes = 9
    _3_bytes = 10
    _4_bytes = 11

    shift_left_v3 = 4 * [
        ascii, # 0000
        ascii, # 0001
        ascii, # 0010
        ascii, # 0011
        ascii, # 0100
        ascii, # 0101
        ascii, # 0110
        ascii, # 0111
        continuation, # 1000
        continuation, # 1001
        continuation, # 1010
        continuation, # 1011
        _2_bytes, # 1100
        _2_bytes, # 1101
        _3_bytes, # 1110
        _4_bytes, # 1111
    ] */
    const __m512i shift_left_v3 = _mm512_setr_epi64(
        0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707,
        0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000,
        0x0707070707070707, 0x0b0a090900000000);

    const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
    values = _mm512_sllv_epi32(values, shift);
  }

  /* 5. Shift right the values by variable amounts to reset lowest bits
      |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
      |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
      |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
      |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
  {
    // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
    const __m512i shift_right = _mm512_setr_epi64(
        0x1919191919191919, 0x0b10151500000000, 0x1919191919191919,
        0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000,
        0x1919191919191919, 0x0b10151500000000);

    const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
    values = _mm512_srlv_epi32(values, shift);
  }

  return values;
}

simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1,
                                                  int &count) {
  const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
  const __m512i expand_ver2 = _mm512_setr_epi64(
      0x0403020103020100, 0x0605040305040302, 0x0807060507060504,
      0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,
      0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);
  const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
  const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
  const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
  const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
  const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
  count = static_cast<int>(count_ones(leading_bytes));
  return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes,
                                    input);
}

simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
  __m512i char_class = _mm512_srli_epi32(input, 4);
  /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */
  const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
  const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);
  char_class =
      _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);
  return expanded_utf8_to_utf32(char_class, input);
}
/* end file src/icelake/icelake_utf8_common.inl.cpp */

/* begin file src/icelake/icelake_utf8_validation.inl.cpp */
// file included directly

simdutf_really_inline __m512i check_special_cases(__m512i input,
                                                  const __m512i prev1) {
  __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080,
                                    0x0202020202020202, 0x4915012180808080);
  const __m512i v_0f = _mm512_set1_epi8(0x0f);
  __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f);

  __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1);
  __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb);
  __m512i index2 = _mm512_and_si512(prev1, v_0f);

  __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
  __m512i mask3 =
      _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101,
                        0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6,
                        0x101010101010101, 0x1010101babaaee6);
  __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
  __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
  return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
}

simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
                                                      const __m512i prev_input,
                                                      const __m512i sc) {
  __m512i prev2 = prev<2>(input, prev_input);
  __m512i prev3 = prev<3>(input, prev_input);
  __m512i is_third_byte = _mm512_subs_epu8(
      prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
  __m512i is_fourth_byte = _mm512_subs_epu8(
      prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
  __m512i is_third_or_fourth_byte =
      _mm512_or_si512(is_third_byte, is_fourth_byte);
  const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
  is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
  // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
  const __m512i v_80 = _mm512_set1_epi8(char(0x80));
  return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc,
                                   0b1101010);
  //__m512i is_third_or_fourth_byte_mask =
  //_mm512_and_si512(is_third_or_fourth_byte, v_80); return
  // _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
}
//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline __m512i is_incomplete(const __m512i input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xffffffffffffffff,
                                        0xffffffffffffffff, 0xbfdfefffffffffff);
  return _mm512_subs_epu8(input, max_value);
}

struct avx512_utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  __m512i error{};

  // The last input we received
  __m512i prev_input_block{};
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  __m512i prev_incomplete{};

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const __m512i input,
                                              const __m512i prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    __m512i prev1 = prev<1>(input, prev_input);
    __m512i sc = check_special_cases(input, prev1);
    this->error = _mm512_or_si512(
        check_multibyte_lengths(input, prev_input, sc), this->error);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error = _mm512_or_si512(this->error, this->prev_incomplete);
  }

  // returns true if ASCII.
  simdutf_really_inline bool check_next_input(const __m512i input) {
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
    if (ascii == 0) {
      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
      return true;
    } else {
      this->check_utf8_bytes(input, this->prev_input_block);
      this->prev_incomplete = is_incomplete(input);
      this->prev_input_block = input;
      return false;
    }
  }
  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return _mm512_test_epi8_mask(this->error, this->error) != 0;
  }
}; // struct avx512_utf8_checker
/* end file src/icelake/icelake_utf8_validation.inl.cpp */

/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
// file included directly

// File contains conversion procedure from VALID UTF-8 strings.

/*
    valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.

    The `OUTPUT` template type decides what to do with UTF-32: store
    it directly or convert into UTF-16 (with AVX512).

    Input:
    - str           - valid UTF-8 string
    - len           - string length
    - out_buffer    - output buffer

    Result:
    - pair.first    - the first unprocessed input byte
    - pair.second   - the first unprocessed output word
*/
template <endianness big_endian, typename OUTPUT>
std::pair<const char *, OUTPUT *>
valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
  static_assert(
      UTF32 or UTF16,
      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
  static_assert(!(UTF32 and big_endian),
                "we do not currently support big-endian UTF-32");

  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  const char *ptr = str;
  const char *end = ptr + len;

  OUTPUT *output = dwords;
  /**
   * In the main loop, we consume 64 bytes per iteration,
   * but we access 64 + 4 bytes.
   * We check for ptr + 64 + 64 <= end because
   * we want to be do maskless writes without overruns.
   */
  while (end - ptr >= 64 + 4) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
    if (ascii == 0) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
      continue;
    }

    const __m512i lane0 = broadcast_epi128<0>(utf8);
    const __m512i lane1 = broadcast_epi128<1>(utf8);
    int valid_count0;
    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
    const __m512i lane2 = broadcast_epi128<2>(utf8);
    int valid_count1;
    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
    if (valid_count0 + valid_count1 <= 16) {
      vec0 = _mm512_mask_expand_epi32(
          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
      valid_count0 += valid_count1;
      vec0 = expand_utf8_to_utf32(vec0);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
    } else {
      vec0 = expand_utf8_to_utf32(vec0);
      vec1 = expand_utf8_to_utf32(vec1);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
    }
    const __m512i lane3 = broadcast_epi128<3>(utf8);
    int valid_count2;
    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
    uint32_t tmp1;
    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
    const __m512i lane4 = _mm512_set1_epi32(tmp1);
    int valid_count3;
    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
    if (valid_count2 + valid_count3 <= 16) {
      vec2 = _mm512_mask_expand_epi32(
          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
      valid_count2 += valid_count3;
      vec2 = expand_utf8_to_utf32(vec2);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
    } else {
      vec2 = expand_utf8_to_utf32(vec2);
      vec3 = expand_utf8_to_utf32(vec3);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
    }
    ptr += 4 * 16;
  }

  if (end - ptr >= 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
    if (ascii == 0) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
    } else {
      const __m512i lane0 = broadcast_epi128<0>(utf8);
      const __m512i lane1 = broadcast_epi128<1>(utf8);
      int valid_count0;
      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
      const __m512i lane2 = broadcast_epi128<2>(utf8);
      int valid_count1;
      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
      if (valid_count0 + valid_count1 <= 16) {
        vec0 = _mm512_mask_expand_epi32(
            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
        valid_count0 += valid_count1;
        vec0 = expand_utf8_to_utf32(vec0);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      } else {
        vec0 = expand_utf8_to_utf32(vec0);
        vec1 = expand_utf8_to_utf32(vec1);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
      }

      const __m512i lane3 = broadcast_epi128<3>(utf8);
      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)

      ptr += 3 * 16;
    }
  }
  return {ptr, output};
}

using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
/* begin file src/icelake/icelake_from_utf8.inl.cpp */
// file included directly

// File contains conversion procedure from possibly invalid UTF-8 strings.

template <endianness big_endian, typename OUTPUT>
// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code
// is legacy.
std::pair<const char *, OUTPUT *>
validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
  static_assert(
      UTF32 or UTF16,
      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
  static_assert(!(UTF32 and big_endian),
                "we do not currently support big-endian UTF-32");

  const char *ptr = str;
  const char *end = ptr + len;
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  OUTPUT *output = dwords;
  avx512_utf8_checker checker{};
  /**
   * In the main loop, we consume 64 bytes per iteration,
   * but we access 64 + 4 bytes.
   * We use masked writes to avoid overruns, see
   * https://github.com/simdutf/simdutf/issues/471
   */
  while (end - ptr >= 64 + 4) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    if (checker.check_next_input(utf8)) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
      continue;
    }
    const __m512i lane0 = broadcast_epi128<0>(utf8);
    const __m512i lane1 = broadcast_epi128<1>(utf8);
    int valid_count0;
    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
    const __m512i lane2 = broadcast_epi128<2>(utf8);
    int valid_count1;
    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
    if (valid_count0 + valid_count1 <= 16) {
      vec0 = _mm512_mask_expand_epi32(
          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
      valid_count0 += valid_count1;
      vec0 = expand_utf8_to_utf32(vec0);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
    } else {
      vec0 = expand_utf8_to_utf32(vec0);
      vec1 = expand_utf8_to_utf32(vec1);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
    }
    const __m512i lane3 = broadcast_epi128<3>(utf8);
    int valid_count2;
    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
    uint32_t tmp1;
    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
    const __m512i lane4 = _mm512_set1_epi32(tmp1);
    int valid_count3;
    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
    if (valid_count2 + valid_count3 <= 16) {
      vec2 = _mm512_mask_expand_epi32(
          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
      valid_count2 += valid_count3;
      vec2 = expand_utf8_to_utf32(vec2);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
    } else {
      vec2 = expand_utf8_to_utf32(vec2);
      vec3 = expand_utf8_to_utf32(vec3);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
    }
    ptr += 4 * 16;
  }
  const char *validatedptr = ptr; // validated up to ptr

  // For the final pass, we validate 64 bytes, but we only transcode
  // 3*16 bytes, so we may end up double-validating 16 bytes.
  if (end - ptr >= 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    if (checker.check_next_input(utf8)) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
    } else {
      const __m512i lane0 = broadcast_epi128<0>(utf8);
      const __m512i lane1 = broadcast_epi128<1>(utf8);
      int valid_count0;
      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
      const __m512i lane2 = broadcast_epi128<2>(utf8);
      int valid_count1;
      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
      if (valid_count0 + valid_count1 <= 16) {
        vec0 = _mm512_mask_expand_epi32(
            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
        valid_count0 += valid_count1;
        vec0 = expand_utf8_to_utf32(vec0);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      } else {
        vec0 = expand_utf8_to_utf32(vec0);
        vec1 = expand_utf8_to_utf32(vec1);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
      }

      const __m512i lane3 = broadcast_epi128<3>(utf8);
      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)

      ptr += 3 * 16;
    }
    validatedptr += 4 * 16;
  }
  if (end != validatedptr) {
    const __m512i utf8 =
        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
                                (const __m512i *)validatedptr);
    checker.check_next_input(utf8);
  }
  checker.check_eof();
  if (checker.errors()) {
    return {ptr, nullptr}; // We found an error.
  }
  return {ptr, output};
}

// Like validating_utf8_to_fixed_length but returns as soon as an error is
// identified todo: replace with the utf-8 to utf-16 routine adapted to utf-32.
// This code is legacy.
template <endianness big_endian, typename OUTPUT>
std::tuple<const char *, OUTPUT *, bool>
validating_utf8_to_fixed_length_with_constant_checks(const char *str,
                                                     size_t len,
                                                     OUTPUT *dwords) {
  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
  static_assert(
      UTF32 or UTF16,
      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
  static_assert(!(UTF32 and big_endian),
                "we do not currently support big-endian UTF-32");

  const char *ptr = str;
  const char *end = ptr + len;
  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
  OUTPUT *output = dwords;
  avx512_utf8_checker checker{};
  /**
   * In the main loop, we consume 64 bytes per iteration,
   * but we access 64 + 4 bytes.
   */
  while (end - ptr >= 4 + 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    bool ascii = checker.check_next_input(utf8);
    if (checker.errors()) {
      return {ptr, output, false}; // We found an error.
    }
    if (ascii) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
      continue;
    }
    const __m512i lane0 = broadcast_epi128<0>(utf8);
    const __m512i lane1 = broadcast_epi128<1>(utf8);
    int valid_count0;
    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
    const __m512i lane2 = broadcast_epi128<2>(utf8);
    int valid_count1;
    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
    if (valid_count0 + valid_count1 <= 16) {
      vec0 = _mm512_mask_expand_epi32(
          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
      valid_count0 += valid_count1;
      vec0 = expand_utf8_to_utf32(vec0);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
    } else {
      vec0 = expand_utf8_to_utf32(vec0);
      vec1 = expand_utf8_to_utf32(vec1);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
    }
    const __m512i lane3 = broadcast_epi128<3>(utf8);
    int valid_count2;
    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
    uint32_t tmp1;
    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
    const __m512i lane4 = _mm512_set1_epi32(tmp1);
    int valid_count3;
    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
    if (valid_count2 + valid_count3 <= 16) {
      vec2 = _mm512_mask_expand_epi32(
          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
      valid_count2 += valid_count3;
      vec2 = expand_utf8_to_utf32(vec2);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
    } else {
      vec2 = expand_utf8_to_utf32(vec2);
      vec3 = expand_utf8_to_utf32(vec3);
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
    }
    ptr += 4 * 16;
  }
  const char *validatedptr = ptr; // validated up to ptr

  // For the final pass, we validate 64 bytes, but we only transcode
  // 3*16 bytes, so we may end up double-validating 16 bytes.
  if (end - ptr >= 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    bool ascii = checker.check_next_input(utf8);
    if (checker.errors()) {
      return {ptr, output, false}; // We found an error.
    }
    if (ascii) {
      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
      output += 64;
      ptr += 64;
    } else {
      const __m512i lane0 = broadcast_epi128<0>(utf8);
      const __m512i lane1 = broadcast_epi128<1>(utf8);
      int valid_count0;
      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
      const __m512i lane2 = broadcast_epi128<2>(utf8);
      int valid_count1;
      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
      if (valid_count0 + valid_count1 <= 16) {
        vec0 = _mm512_mask_expand_epi32(
            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
        valid_count0 += valid_count1;
        vec0 = expand_utf8_to_utf32(vec0);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
      } else {
        vec0 = expand_utf8_to_utf32(vec0);
        vec1 = expand_utf8_to_utf32(vec1);
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
      }

      const __m512i lane3 = broadcast_epi128<3>(utf8);
      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)

      ptr += 3 * 16;
    }
    validatedptr += 4 * 16;
  }
  if (end != validatedptr) {
    const __m512i utf8 =
        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
                                (const __m512i *)validatedptr);
    checker.check_next_input(utf8);
  }
  checker.check_eof();
  if (checker.errors()) {
    return {ptr, output, false}; // We found an error.
  }
  return {ptr, output, true};
}
/* end file src/icelake/icelake_from_utf8.inl.cpp */

/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
// file included directly

// File contains conversion procedure from possibly invalid UTF-8 strings.

template <bool is_remaining>
simdutf_really_inline size_t process_block_from_utf8_to_latin1(
    const char *buf, size_t len, char *latin_output, __m512i minus64,
    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
  __mmask64 load_mask =
      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
  __mmask64 nonascii = _mm512_movepi8_mask(input);
  if (nonascii == 0) {
    if (*next_leading_ptr) { // If we ended with a leading byte, it is an error.
      return 0;              // Indicates error
    }
    is_remaining
        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
        : _mm512_storeu_si512((__m512i *)latin_output, input);
    return len;
  }

  const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);

  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
  __mmask64 invalid_leading_bytes =
      _mm512_mask_cmpgt_epu8_mask(leading, highbits, one);

  if (invalid_leading_bytes) {
    return 0; // Indicates error
  }

  __mmask64 leading_shift = (leading << 1) | *next_leading_ptr;

  if ((nonascii ^ leading) != leading_shift) {
    return 0; // Indicates error
  }

  const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
  input =
      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);

  __mmask64 retain = ~leading & load_mask;
  __m512i output = _mm512_maskz_compress_epi8(retain, input);
  int64_t written_out = count_ones(retain);
  if (written_out == 0) {
    return 0; // Indicates error
  }
  *next_bit6_ptr = bit6 >> 63;
  *next_leading_ptr = leading >> 63;

  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);

  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);

  return written_out;
}

size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len,
                             char *&inlatin_output) {
  const char *buf = inbuf;
  char *latin_output = inlatin_output;
  char *start = latin_output;
  size_t pos = 0;
  __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
  __m512i one = _mm512_set1_epi8(1);
  __mmask64 next_leading = 0;
  __mmask64 next_bit6 = 0;

  while (pos + 64 <= len) {
    size_t written = process_block_from_utf8_to_latin1<false>(
        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
    if (written == 0) {
      inlatin_output = latin_output;
      inbuf = buf + pos - next_leading;
      return 0; // Indicates error at pos or after, or just before pos (too
                // short error)
    }
    latin_output += written;
    pos += 64;
  }

  if (pos < len) {
    size_t remaining = len - pos;
    size_t written = process_block_from_utf8_to_latin1<true>(
        buf + pos, remaining, latin_output, minus64, one, &next_leading,
        &next_bit6);
    if (written == 0) {
      inbuf = buf + pos - next_leading;
      inlatin_output = latin_output;
      return 0; // Indicates error at pos or after, or just before pos (too
                // short error)
    }
    latin_output += written;
  }
  if (next_leading) {
    inbuf = buf + len - next_leading;
    inlatin_output = latin_output;
    return 0; // Indicates error at end of buffer
  }
  inlatin_output = latin_output;
  inbuf += len;
  return size_t(latin_output - start);
}
/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
// file included directly

// File contains conversion procedure from valid UTF-8 strings.

template <bool is_remaining>
simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(
    const char *buf, size_t len, char *latin_output, __m512i minus64,
    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
  __mmask64 load_mask =
      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
  __mmask64 nonascii = _mm512_movepi8_mask(input);

  if (nonascii == 0) {
    is_remaining
        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
        : _mm512_storeu_si512((__m512i *)latin_output, input);
    return len;
  }

  __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);

  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));

  *next_leading_ptr = leading >> 63;

  __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
  input =
      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
  *next_bit6_ptr = bit6 >> 63;

  __mmask64 retain = ~leading & load_mask;
  __m512i output = _mm512_maskz_compress_epi8(retain, input);
  int64_t written_out = count_ones(retain);
  if (written_out == 0) {
    return 0; // Indicates error
  }
  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);
  // Optimization opportunity: sometimes, masked writes are not needed.
  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
  return written_out;
}

size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len,
                                   char *latin_output) {
  char *start = latin_output;
  size_t pos = 0;
  __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
  __m512i one = _mm512_set1_epi8(1);
  __mmask64 next_leading = 0;
  __mmask64 next_bit6 = 0;

  while (pos + 64 <= len) {
    size_t written = process_valid_block_from_utf8_to_latin1<false>(
        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
    latin_output += written;
    pos += 64;
  }

  if (pos < len) {
    size_t remaining = len - pos;
    size_t written = process_valid_block_from_utf8_to_latin1<true>(
        buf + pos, remaining, latin_output, minus64, one, &next_leading,
        &next_bit6);
    latin_output += written;
  }

  return (size_t)(latin_output - start);
}
/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */

/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
// file included directly
size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                                       char *latin1_output) {
  const char32_t *end = buf + len;
  __m512i v_0xFF = _mm512_set1_epi32(0xff);
  __m512i shufmask = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
  while (end - buf >= 16) {
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      return 0;
    }
    _mm_storeu_si128(
        (__m128i *)latin1_output,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
    latin1_output += 16;
    buf += 16;
  }
  if (buf < end) {
    uint16_t mask = uint16_t((1 << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      return 0;
    }
    _mm_mask_storeu_epi8(
        latin1_output, mask,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
  }
  return len;
}

std::pair<result, char *>
icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                            char *latin1_output) {
  const char32_t *end = buf + len;
  const char32_t *start = buf;
  __m512i v_0xFF = _mm512_set1_epi32(0xff);
  __m512i shufmask = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
  while (end - buf >= 16) {
    __m512i in = _mm512_loadu_si512((__m512i *)buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      while (uint32_t(*buf) <= 0xff) {
        *latin1_output++ = uint8_t(*buf++);
      }
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            latin1_output);
    }
    _mm_storeu_si128(
        (__m128i *)latin1_output,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
    latin1_output += 16;
    buf += 16;
  }
  if (buf < end) {
    uint16_t mask = uint16_t((1 << (end - buf)) - 1);
    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
      while (uint32_t(*buf) <= 0xff) {
        *latin1_output++ = uint8_t(*buf++);
      }
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            latin1_output);
    }
    _mm_mask_storeu_epi8(
        latin1_output, mask,
        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
  }
  return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
}
/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */

/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
// file included directly

// Todo: currently, this is just the haswell code, optimize for icelake kernel.
std::pair<const char32_t *, char *>
avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len,
                             char *utf8_output) {
  const char32_t *end = buf + len;
  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
  __m256i running_max = _mm256_setzero_si256();
  __m256i forbidden_bytemask = _mm256_setzero_si256();

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
                                        _mm256_and_si256(nextin, v_7fffffff));
    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);

    // Try to apply UTF-16 => UTF-8 routine on 256 bits
    // (haswell/avx2_convert_utf16_to_utf8.cpp)

    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // Must check for overflow in packing
    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffffffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
      forbidden_bytemask = _mm256_or_si256(
          forbidden_bytemask,
          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));

      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
        single UFT-8 byte
        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
        UTF-8 bytes
        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
        three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
      _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
    } else {
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may require
      // large, non-trivial tables?
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else { // 4-byte
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
    return std::make_pair(nullptr, utf8_output);
  }

  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
    return std::make_pair(nullptr, utf8_output);
  }

  return std::make_pair(buf, utf8_output);
}

// Todo: currently, this is just the haswell code, optimize for icelake kernel.
std::pair<result, char *>
avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                         char *utf8_output) {
  const char32_t *end = buf + len;
  const char32_t *start = buf;

  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
    // Check for too large input
    const __m256i max_input =
        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
    if (static_cast<uint32_t>(_mm256_movemask_epi8(
            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            utf8_output);
    }

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
                                        _mm256_and_si256(nextin, v_7fffffff));
    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);

    // Try to apply UTF-16 => UTF-8 routine on 256 bits
    // (haswell/avx2_convert_utf16_to_utf8.cpp)

    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // Must check for overflow in packing
    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffffffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes

      // Check for illegal surrogate code units
      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
      const __m256i forbidden_bytemask =
          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
          0x0) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              utf8_output);
      }

      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
        single UFT-8 byte
        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
        UTF-8 bytes
        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
        three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
      _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
    } else {
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may require
      // large, non-trivial tables?
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else { // 4-byte
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
}
/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */

/* begin file src/icelake/icelake_ascii_validation.inl.cpp */
// file included directly

bool validate_ascii(const char *buf, size_t len) {
  const char *end = buf + len;
  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
  __m512i running_or = _mm512_setzero_si512();
  for (; end - buf >= 64; buf += 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)buf);
    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
                                           0xf8); // running_or | (utf8 & ascii)
  }
  if (buf < end) {
    const __m512i utf8 = _mm512_maskz_loadu_epi8(
        (uint64_t(1) << (end - buf)) - 1, (const __m512i *)buf);
    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
                                           0xf8); // running_or | (utf8 & ascii)
  }
  return (_mm512_test_epi8_mask(running_or, running_or) == 0);
}
/* end file src/icelake/icelake_ascii_validation.inl.cpp */
/* begin file src/icelake/icelake_utf32_validation.inl.cpp */
// file included directly

bool validate_utf32(const char32_t *buf, size_t len) {
  if (simdutf_unlikely(len == 0)) {
    return true;
  }
  const char32_t *end = buf + len;

  const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
  __m512i currentmax = _mm512_setzero_si512();
  __m512i currentoffsetmax = _mm512_setzero_si512();

  // Optimized: Process 32 values (2x 512-bit) per iteration for better
  // throughput
  while (end - buf >= 32) {
    __m512i utf32_1 = _mm512_loadu_si512((const __m512i *)buf);
    __m512i utf32_2 = _mm512_loadu_si512((const __m512i *)(buf + 16));
    buf += 32;

    // Process both blocks in parallel to maximize instruction-level parallelism
    __m512i offsetmax_1 = _mm512_add_epi32(utf32_1, offset);
    __m512i offsetmax_2 = _mm512_add_epi32(utf32_2, offset);

    currentoffsetmax = _mm512_max_epu32(offsetmax_1, currentoffsetmax);
    currentmax = _mm512_max_epu32(utf32_1, currentmax);

    currentoffsetmax = _mm512_max_epu32(offsetmax_2, currentoffsetmax);
    currentmax = _mm512_max_epu32(utf32_2, currentmax);
  }

  // Handle remaining 16-31 values
  if (end - buf >= 16) {
    __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
    buf += 16;
    currentoffsetmax =
        _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
    currentmax = _mm512_max_epu32(utf32, currentmax);
  }

  // Handle remaining 0-15 values with masked load
  if (buf < end) {
    __m512i utf32 =
        _mm512_maskz_loadu_epi32(__mmask16((1 << (end - buf)) - 1), buf);
    currentoffsetmax =
        _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
    currentmax = _mm512_max_epu32(utf32, currentmax);
  }

  const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
  const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
  const auto outside_range = _mm512_cmpgt_epu32_mask(currentmax, standardmax);
  if (outside_range != 0) {
    return false;
  }

  const auto surrogate =
      _mm512_cmpgt_epu32_mask(currentoffsetmax, standardoffsetmax);
  if (surrogate != 0) {
    return false;
  }

  return true;
}
/* end file src/icelake/icelake_utf32_validation.inl.cpp */
/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
// file included directly

static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len,
                                               char *utf8_output,
                                               int mask_output) {
  __mmask64 nonascii = _mm512_movepi8_mask(input);
  size_t output_size = input_len + (size_t)count_ones(nonascii);

  // Mask to denote whether the byte is a leading byte that is not ascii
  __mmask64 sixth = _mm512_cmpge_epu8_mask(
      input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000

  const uint64_t alternate_bits = UINT64_C(0x5555555555555555);
  uint64_t ascii = ~nonascii;
  // the bits in ascii are inverted and zeros are interspersed in between them
  uint64_t maskA = ~_pdep_u64(ascii, alternate_bits);
  uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits);

  // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD)
  __m512i input_interleaved = _mm512_permutexvar_epi8(
      _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818,
                       0x37173616, 0x35153414, 0x33133212, 0x31113010,
                       0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808,
                       0x27072606, 0x25052404, 0x23032202, 0x21012000),
      input);

  // double size of each byte, and insert the leading byte 1100 0010

  /*
  upscale the bytes to 16-bit value, adding the 0b11000000 leading byte in the
  process. We adjust for the bytes that have their two most significant bits.
  This takes care of the first 32 bytes, assuming we interleaved the bytes. */
  __m512i outputA =
      _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8);
  outputA = _mm512_mask_add_epi16(
      outputA, (__mmask32)sixth, outputA,
      _mm512_set1_epi16(1 - 0x4000)); // 1- 0x4000 = 1100 0000 0000 0001????

  // in the second 32-bit half, set first or second option based on whether
  // original input is leading byte (second case) or not (first case)
  __m512i leadingB =
      _mm512_mask_blend_epi16((__mmask32)(sixth >> 32),
                              _mm512_set1_epi16(0x00c2),  // 0000 0000 1101 0010
                              _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011
  __m512i outputB = _mm512_ternarylogic_epi32(
      input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00),
      (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB

  // prune redundant bytes
  outputA = _mm512_maskz_compress_epi8(maskA, outputA);
  outputB = _mm512_maskz_compress_epi8(maskB, outputB);

  size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32;

  if (mask_output) {
    if (input_len > 32) { // is the second half of the input vector used?
      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA);
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
      utf8_output += output_sizeA;
      write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA));
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB);
    } else {
      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size);
      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
    }
  } else {
    _mm512_storeu_si512(utf8_output, outputA);
    utf8_output += output_sizeA;
    _mm512_storeu_si512(utf8_output, outputB);
  }
  return output_size;
}

static inline size_t latin1_to_utf8_avx512_branch(__m512i input,
                                                  char *utf8_output) {
  __mmask64 nonascii = _mm512_movepi8_mask(input);
  if (nonascii) {
    return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0);
  } else {
    _mm512_storeu_si512(utf8_output, input);
    return 64;
  }
}

size_t latin1_to_utf8_avx512_start(const char *buf, size_t len,
                                   char *utf8_output) {
  char *start = utf8_output;
  size_t pos = 0;
  // if there's at least 128 bytes remaining, we don't need to mask the output
  for (; pos + 128 <= len; pos += 64) {
    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
    utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output);
  }
  // in the last 128 bytes, the first 64 may require masking the output
  if (pos + 64 <= len) {
    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
    utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1);
    pos += 64;
  }
  // with the last 64 bytes, the input also needs to be masked
  if (pos < len) {
    __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos));
    __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos));
    utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1);
  }
  return (size_t)(utf8_output - start);
}
/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
void avx512_convert_latin1_to_utf32(const char *buf, size_t len,
                                    char32_t *utf32_output) {
  while (len >= 16) {
    // Load 16 Latin1 characters into a 128-bit register
    __m128i in = _mm_loadu_si128((__m128i *)buf);

    // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using
    // vpmovzxbd
    __m512i out = _mm512_cvtepu8_epi32(in);

    // Store the results back to memory
    _mm512_storeu_si512((__m512i *)utf32_output, out);

    len -= 16;
    buf += 16;
    utf32_output += 16;
  }

  __mmask16 mask = __mmask16((1 << len) - 1);
  __m128i in = _mm_maskz_loadu_epi8(mask, buf);
  __m512i out = _mm512_cvtepu8_epi32(in);
  _mm512_mask_storeu_epi32((__m512i *)utf32_output, mask, out);
}
/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
/* begin file src/icelake/icelake_base64.inl.cpp */
// file included directly
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

struct block64 {
  __m512i chunks[1];
};

template <bool base64_url, bool use_lines>
size_t encode_base64_impl(char *dst, const char *src, size_t srclen,
                          base64_options options,
                          size_t line_length = simdutf::default_line_length) {
  size_t offset = 0;
  if (line_length < 4) {
    line_length = 4; // We do not support line_length less than 4
  }
  // credit: Wojciech Muła
  const uint8_t *input = (const uint8_t *)src;

  uint8_t *out = (uint8_t *)dst;
  static const char *lookup_tbl =
      base64_url
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  const __m512i shuffle_input = _mm512_setr_epi32(
      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
      0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
  const __m512i lookup =
      _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl));
  const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a));
  size_t size = srclen;
  __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1
  // We want that input == end_input means that we must stop.
  const uint8_t *end_input = input + (size - (size % 48));
  while (input != end_input) {
    const __m512i v = _mm512_maskz_loadu_epi8(
        input_mask, reinterpret_cast<const __m512i *>(input));
    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
    const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
    const __m512i result = _mm512_permutexvar_epi8(indices, lookup);
    if (use_lines) {
      if (offset + 64 > line_length) {
        if (line_length >= 64) {
          __m512i expanded = _mm512_mask_expand_epi8(
              _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))),
              result);
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded);
          __m128i last_lane =
              _mm512_extracti32x4_epi32(result, 3); // Lane 3 (bytes 48-63)
          uint8_t last_byte =
              static_cast<uint8_t>(_mm_extract_epi8(last_lane, 15));
          out[64] = last_byte;
          out += 65;
          offset = 64 - (line_length - offset);
        } else { // slow path
          alignas(64) uint8_t local_buffer[64];
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer),
                              result);
          size_t out_pos = 0;
          size_t local_offset = offset;
          for (size_t j = 0; j < 64;) {
            if (local_offset == line_length) {
              out[out_pos++] = '\n';
              local_offset = 0;
            }
            out[out_pos++] = local_buffer[j++];
            local_offset++;
          }
          offset = local_offset;
          out += out_pos;
        }
      } else {
        _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
        offset += 64;
        out += 64;
      }
    } else {
      _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
      out += 64;
    }
    input += 48;
  }
  size = size % 48;

  input_mask = ((__mmask64)1 << size) - 1;
  const __m512i v = _mm512_maskz_loadu_epi8(
      input_mask, reinterpret_cast<const __m512i *>(input));
  const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
  const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
  bool padding_needed =
      (((options & base64_url) == 0) ^
       ((options & base64_reverse_padding) == base64_reverse_padding));
  size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0;
  size_t output_len = ((size + 2) / 3) * 4;
  size_t non_padded_output_len = output_len - padding_amount;
  if (!padding_needed) {
    output_len = non_padded_output_len;
  }
  // If no output, we are done.
  if (output_len == 0) {
    return (size_t)(out - (uint8_t *)dst);
  }
  __mmask64 output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len);
  __m512i result = _mm512_mask_permutexvar_epi8(
      _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1,
      indices, lookup);
  if (use_lines) {
    if (offset + output_len > line_length) {
      if (line_length >= 64) {
        __m512i expanded = _mm512_mask_expand_epi8(
            _mm512_set1_epi8('\n'), ~(1ULL << ((line_length - offset))),
            result);
        if (output_len == 64) {
          _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), expanded);
          out += 64;
          _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out - 63),
                                  1ULL << 63, result);
          out++;
        } else {
          output_mask = 0xFFFFFFFFFFFFFFFF >> (64 - output_len - 1);
          _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
                                  expanded);
          out += output_len + 1;
        }
      } else {
        alignas(64) uint8_t local_buffer[64];
        _mm512_storeu_si512(reinterpret_cast<__m512i *>(local_buffer), result);
        size_t out_pos = 0;
        size_t local_offset = offset;
        for (size_t j = 0; j < output_len;) {
          if (local_offset == line_length) {
            out[out_pos++] = '\n';
            local_offset = 0;
          }
          out[out_pos++] = local_buffer[j++];
          local_offset++;
        }
        offset = local_offset;
        out += out_pos;
      }
    } else {
      _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
                              result);
      out += output_len;
    }
  } else {
    _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
                            result);
    out += output_len;
  }
  return (size_t)(out - (uint8_t *)dst);
}

template <bool base64_url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  return encode_base64_impl<base64_url, false>(dst, src, srclen, options);
}

template <bool base64_url, bool ignore_garbage, bool default_or_url>
static inline uint64_t to_base64_mask(block64 *b, uint64_t *error,
                                      uint64_t input_mask = UINT64_MAX) {
  __m512i input = b->chunks[0];
  const __m512i ascii_space_tbl = _mm512_set_epi8(
      0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10,
      9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0,
      0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
  __m512i lookup0;
  if (default_or_url) {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, 63, -128, 62, -128, 62, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
  } else if (base64_url) {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
  } else {
    lookup0 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
  }
  __m512i lookup1;
  if (default_or_url) {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  } else if (base64_url) {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  } else {
    lookup1 = _mm512_set_epi8(
        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
  }

  const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
  const __m512i combined = _mm512_or_si512(translated, input);
  const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask;
  if (!ignore_garbage && mask) {
    const __mmask64 spaces =
        _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input),
                               input) &
        input_mask;
    *error = (mask ^ spaces);
  }
  b->chunks[0] = translated;

  return mask | (~input_mask);
}

static inline void copy_block(block64 *b, char *output) {
  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]);
}

static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t nmask = ~mask;
  __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]);
  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c);
  return _mm_popcnt_u64(nmask);
}

// The caller of this function is responsible to ensure that there are 64 bytes
// available from reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char *src) {
  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
}

static inline void load_block_partial(block64 *b, const char *src,
                                      __mmask64 input_mask) {
  b->chunks[0] = _mm512_maskz_loadu_epi8(
      input_mask, reinterpret_cast<const __m512i *>(src));
}

// The caller of this function is responsible to ensure that there are 128 bytes
// available from reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char16_t *src) {
  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
  __m512i p = _mm512_packus_epi16(m1, m2);
  b->chunks[0] =
      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
}

static inline void load_block_partial(block64 *b, const char16_t *src,
                                      __mmask64 input_mask) {
  __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask,
                                        reinterpret_cast<const __m512i *>(src));
  __m512i m2 =
      _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32),
                               reinterpret_cast<const __m512i *>(src + 32));
  __m512i p = _mm512_packus_epi16(m1, m2);
  b->chunks[0] =
      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
}

static inline void base64_decode(char *out, __m512i str) {
  const __m512i merge_ab_and_bc =
      _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
  const __m512i merged =
      _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
  const __m512i pack = _mm512_set_epi8(
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
      52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
      28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
      5, 6, 0, 1, 2);
  const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
  _mm512_mask_storeu_epi8(
      (__m512i *)out, 0xffffffffffff,
      shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
}
// decode 64 bytes and output 48 bytes
static inline void base64_decode_block(char *out, const char *src) {
  base64_decode(out,
                _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)));
}
static inline void base64_decode_block(char *out, block64 *b) {
  base64_decode(out, b->chunks[0]);
}

template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  (void)options;
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t padding_characters = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    if (!ignore_garbage && padding_characters > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0, true};
    }
    return {SUCCESS, full_input_length, 0};
  }
  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  // figure out why block_size == 2 is sometimes best???
  constexpr size_t block_size = 6;
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      uint64_t error = 0;
      uint64_t badcharmask =
          to_base64_mask<base64_url, ignore_garbage, default_or_url>(&b,
                                                                     &error);
      if (!ignore_garbage && error) {
        src -= 64;
        size_t error_offset = _tzcnt_u64(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        // optimization opportunity: check for simple masks like those made of
        // continuous 1s followed by continuous 0s. And masks containing a
        // single bad character.
        bufferptr += compress_block(&b, badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        copy_block(&b, bufferptr);
        bufferptr += 64;
      } else {
        base64_decode_block(dst, &b);
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 1); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  int last_block_len = (int)(srcend - src);
  if (last_block_len != 0) {
    __mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1;
    block64 b;
    load_block_partial(&b, src, input_mask);
    uint64_t error = 0;
    uint64_t badcharmask =
        to_base64_mask<base64_url, ignore_garbage, default_or_url>(&b, &error,
                                                                   input_mask);
    if (!ignore_garbage && error) {
      size_t error_offset = _tzcnt_u64(error);
      return {error_code::INVALID_BASE64_CHARACTER,
              size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
    }
    src += last_block_len;
    bufferptr += compress_block(&b, badcharmask, bufferptr);
  }

  char *buffer_start = buffer;
  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    base64_decode_block(dst, buffer_start);
    dst += 48;
  }
  if ((bufferptr - buffer_start) != 0) {
    // For efficiency reasons, we end up reproducing much of the code
    // in base64_tail_decode_impl. Better engineering would be to
    // refactor the code so that we can call it without a performance hit.
    size_t rem = (bufferptr - buffer_start);
    int idx = rem % 4;
    __mmask64 mask = ((__mmask64)1 << rem) - 1;
    __m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start);
    size_t output_len = (rem / 4) * 3;
    __mmask64 output_mask = mask >> (rem - output_len);
    const __m512i merge_ab_and_bc =
        _mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140));
    const __m512i merged =
        _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
    const __m512i pack = _mm512_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
        52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
        28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
        5, 6, 0, 1, 2);
    const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
    // We never should have that the number of base64 characters + the
    // number of padding characters is more than 4.
    if (!ignore_garbage && (idx + padding_characters > 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit),
              true};
    }
    // The idea here is that in loose mode,
    // if there is padding at all, it must be used
    // to form 4-wise chunk. However, in loose mode,
    // we do accept no padding at all.
    if (!ignore_garbage &&
        last_chunk_options == last_chunk_handling_options::loose &&
        (idx >= 2) && padding_characters > 0 &&
        ((idx + padding_characters) & 3) != 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit),
              true};
    } else
      // The idea here is that in strict mode, we do not want to accept
      // incomplete base64 chunks. So if the chunk was otherwise valid, we
      // return BASE64_INPUT_REMAINDER.
      if (!ignore_garbage &&
          last_chunk_options == last_chunk_handling_options::strict &&
          (idx >= 2) && ((idx + padding_characters) & 3) != 0) {
        // The partial chunk was at src - idx
        _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
        dst += output_len;
        return {BASE64_INPUT_REMAINDER, equallocation, size_t(dst - dstinit),
                true};
      } else
        // If there is a partial chunk with insufficient padding, with
        // stop_before_partial, we need to just ignore it. In "only full" mode,
        // skip the minute there are padding characters.
        if ((last_chunk_options ==
                 last_chunk_handling_options::stop_before_partial &&
             (padding_characters + idx < 4) && (idx != 0) &&
             (idx >= 2 || padding_characters == 0)) ||
            (last_chunk_options ==
                 last_chunk_handling_options::only_full_chunks &&
             (idx >= 2 || padding_characters == 0))) {
          _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
          dst += output_len;
          // we need to rewind src to before the partial chunk
          size_t characters_to_skip = idx;
          while (characters_to_skip > 0) {
            src--;
            auto c = *src;
            uint8_t code = to_base64[uint8_t(c)];
            if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) {
              characters_to_skip--;
            }
          }
          // And then we need to skip ignored characters
          // See https://github.com/simdutf/simdutf/issues/824
          while (src > srcinit) {
            auto c = *(src - 1);
            uint8_t code = to_base64[uint8_t(c)];
            if (simdutf::scalar::base64::is_eight_byte(c) && code <= 63) {
              break;
            }
            src--;
          }
          return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
        } else {
          if (idx == 2) {
            if (!ignore_garbage &&
                last_chunk_options == last_chunk_handling_options::strict) {
              uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) +
                                (uint32_t(bufferptr[-1]) << 2 * 6);
              if (triple & 0xffff) {
                _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
                dst += output_len;
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                        size_t(dst - dstinit)};
              }
            }
            output_mask = (output_mask << 1) | 1;
            output_len += 1;
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
          } else if (idx == 3) {
            if (!ignore_garbage &&
                last_chunk_options == last_chunk_handling_options::strict) {
              uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) +
                                (uint32_t(bufferptr[-2]) << 2 * 6) +
                                (uint32_t(bufferptr[-1]) << 1 * 6);
              if (triple & 0xff) {
                _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
                dst += output_len;
                return {BASE64_EXTRA_BITS, size_t(src - srcinit),
                        size_t(dst - dstinit)};
              }
            }
            output_mask = (output_mask << 2) | 3;
            output_len += 2;
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
          } else if (!ignore_garbage && idx == 1 &&
                     (!is_partial(last_chunk_options) ||
                      (is_partial(last_chunk_options) &&
                       padding_characters > 0))) {
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
            return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
                    size_t(dst - dstinit)};
          } else if (!ignore_garbage && idx == 0 && padding_characters > 0) {
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
            return {INVALID_BASE64_CHARACTER, equallocation,
                    size_t(dst - dstinit), true};
          } else {
            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
            dst += output_len;
          }
        }
    if (!ignore_garbage && !is_partial(last_chunk_options) &&
        padding_characters > 0) {
      size_t output_count = size_t(dst - dstinit);
      if ((output_count % 3 == 0) ||
          ((output_count % 3) + 1 + padding_characters != 4)) {
        return {INVALID_BASE64_CHARACTER, equallocation, output_count, true};
      }
    }
    return {SUCCESS, full_input_length, size_t(dst - dstinit)};
  }

  if (!ignore_garbage && padding_characters > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + padding_characters != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit),
              true};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}

simdutf_warn_unused size_t icelake_binary_length_from_base64(const char *input,
                                                             size_t length) {
  size_t count = 0;
  const char *ptr = input;
  const char *end = input + length;

  __m512i spaces = _mm512_set1_epi8(0x20);
  while (ptr + 64 <= end) {
    __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr));
    uint64_t mask = _mm512_cmpgt_epi8_mask(data, spaces);
    count += count_ones(mask);
    ptr += 64;
  }

  while (ptr < end) {
    count += (*ptr > 0x20) ? 1 : 0;
    ptr++;
  }

  size_t padding = 0;
  size_t pos = length;
  while (pos > 0 && padding < 2) {
    char c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

simdutf_warn_unused size_t
icelake_binary_length_from_base64(const char16_t *input, size_t length) {
  size_t count = 0;
  const char16_t *ptr = input;
  const char16_t *end = input + length;

  __m512i spaces = _mm512_set1_epi16(0x20);
  while (ptr + 32 <= end) {
    __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptr));
    __mmask32 mask = _mm512_cmpgt_epi16_mask(data, spaces);
    count += _mm_popcnt_u32(mask);
    ptr += 32;
  }

  while (ptr < end) {
    count += (*ptr > 0x20) ? 1 : 0;
    ptr++;
  }

  size_t padding = 0;
  size_t pos = length;
  while (pos > 0 && padding < 2) {
    char16_t c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}
/* end file src/icelake/icelake_base64.inl.cpp */
/* begin file src/icelake/icelake_find.inl.cpp */
simdutf_really_inline const char *util_find(const char *start, const char *end,
                                            char character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  const size_t step = 64;
  __m512i char_vec = _mm512_set1_epi8(character);

  // Handle unaligned beginning with a masked load
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % step;
  if (misalignment != 0) {
    size_t adjustment = step - misalignment;
    if (size_t(end - start) < adjustment) {
      adjustment = end - start;
    }
    __mmask64 load_mask = 0xFFFFFFFFFFFFFFFF >> (64 - adjustment);
    __m512i data = _mm512_maskz_loadu_epi8(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec);
    match_mask &= load_mask; // When searching for null terminators, this
                             // prevents false positives

    if (match_mask != 0) {
      size_t index = _tzcnt_u64(match_mask);
      return start + index;
    }
    start += adjustment;
  }
  // Process 64 bytes (512 bits) at a time with AVX-512
  // Main loop for full 128-byte chunks
  while (size_t(end - start) >= 2 * step) {
    __m512i data1 =
        _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start));
    __mmask64 mask1 = _mm512_cmpeq_epi8_mask(data1, char_vec);

    __m512i data2 =
        _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start + step));
    __mmask64 mask2 = _mm512_cmpeq_epi8_mask(data2, char_vec);
    if (!_kortestz_mask64_u8(mask1, mask2)) {
      if (mask1 != 0) {
        // Found a match, return the first one
        size_t index = _tzcnt_u64(mask1);
        return start + index;
      }
      size_t index = _tzcnt_u64(mask2);
      return start + index + step;
    }
    start += 2 * step;
  }

  // Main loop for full 64-byte chunks
  while (size_t(end - start) >= step) {
    __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start));
    __mmask64 mask = _mm512_cmpeq_epi8_mask(data, char_vec);

    if (mask != 0) {
      // Found a match, return the first one
      size_t index = _tzcnt_u64(mask);
      return start + index;
    }

    start += step;
  }

  // Handle remaining bytes with masked load
  size_t remaining = end - start;
  if (remaining > 0) {
    // Create a mask for the remaining bytes using shifted 0xFFFFFFFFFFFFFFFF
    __mmask64 load_mask = 0xFFFFFFFFFFFFFFFF >> (64 - remaining);
    __m512i data = _mm512_maskz_loadu_epi8(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask64 match_mask = _mm512_cmpeq_epi8_mask(data, char_vec);

    // Apply load mask to avoid false positives
    match_mask &= load_mask;

    if (match_mask != 0) {
      // Found a match in the remaining bytes
      size_t index = _tzcnt_u64(match_mask);
      return start + index;
    }
  }

  return end;
}

simdutf_really_inline const char16_t *util_find(const char16_t *start,
                                                const char16_t *end,
                                                char16_t character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;

  // Process 32 char16_t (64 bytes, 512 bits) at a time with AVX-512
  const size_t step = 32;
  __m512i char_vec = _mm512_set1_epi16(character);

  // Handle unaligned beginning with a masked load
  uintptr_t misalignment =
      reinterpret_cast<uintptr_t>(start) % (step * sizeof(char16_t));
  if (misalignment != 0 && misalignment % 2 == 0) {
    size_t adjustment =
        (step * sizeof(char16_t) - misalignment) / sizeof(char16_t);
    if (size_t(end - start) < adjustment) {
      adjustment = end - start;
    }
    __mmask32 load_mask = 0xFFFFFFFF >> (32 - adjustment);
    __m512i data = _mm512_maskz_loadu_epi16(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec);
    match_mask &= load_mask; // When searching for null terminators, this
                             // prevents false positives

    if (match_mask != 0) {
      size_t index = _tzcnt_u32(match_mask);
      return start + index;
    }
    start += adjustment;
  }

  // Main loop for full 32-element chunks
  while (size_t(end - start) >= step) {
    __m512i data = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(start));
    __mmask32 mask = _mm512_cmpeq_epi16_mask(data, char_vec);

    if (mask != 0) {
      // Found a match, return the first one
      size_t index = _tzcnt_u32(mask);
      return start + index;
    }

    start += step;
  }

  // Handle remaining elements with masked load
  size_t remaining = end - start;
  if (remaining > 0) {
    __mmask32 load_mask = 0xFFFFFFFF >> (32 - remaining);
    __m512i data = _mm512_maskz_loadu_epi16(
        load_mask, reinterpret_cast<const __m512i *>(start));
    __mmask32 match_mask = _mm512_cmpeq_epi16_mask(data, char_vec);

    if (match_mask != 0) {
      size_t index = _tzcnt_u32(match_mask);
      return start + index;
    }
  }

  return end;
}
/* end file src/icelake/icelake_find.inl.cpp */

#include <cstdint>

} // namespace
} // namespace icelake
} // namespace simdutf

/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace icelake {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // We don't count 0th bytes in the vectorized loops above, this
    // is why we need to count them in the end.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace icelake
} // namespace simdutf
/* end file src/generic/utf32.h */

namespace simdutf {
namespace icelake {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return true;
  }
  avx512_utf8_checker checker{};
  const char *ptr = buf;
  const char *end = ptr + len;
  for (; end - ptr >= 64; ptr += 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    checker.check_next_input(utf8);
  }
  if (end != ptr) {
    const __m512i utf8 = _mm512_maskz_loadu_epi8(
        ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr);
    checker.check_next_input(utf8);
  }
  checker.check_eof();
  return !checker.errors();
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, len);
  }
  avx512_utf8_checker checker{};
  const char *ptr = buf;
  const char *end = ptr + len;
  size_t count{0};
  for (; end - ptr >= 64; ptr += 64) {
    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
    checker.check_next_input(utf8);
    if (checker.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(buf),
          reinterpret_cast<const char *>(buf + count), len - count);
      res.count += count;
      return res;
    }
    count += 64;
  }
  if (end != ptr) {
    const __m512i utf8 = _mm512_maskz_loadu_epi8(
        ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr);
    checker.check_next_input(utf8);
  }
  checker.check_eof();
  if (checker.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(buf),
        reinterpret_cast<const char *>(buf + count), len - count);
    res.count += count;
    return res;
  }
  return result(error_code::SUCCESS, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return icelake::validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  const char *buf_orig = buf;
  const char *end = buf + len;
  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
  for (; end - buf >= 64; buf += 64) {
    const __m512i input = _mm512_loadu_si512((const __m512i *)buf);
    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
    if (notascii) {
      return result(error_code::TOO_LARGE,
                    buf - buf_orig + _tzcnt_u64(notascii));
    }
  }
  if (end != buf) {
    const __m512i input = _mm512_maskz_loadu_epi8(
        ~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf);
    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
    if (notascii) {
      return result(error_code::TOO_LARGE,
                    buf - buf_orig + _tzcnt_u64(notascii));
    }
  }
  return result(error_code::SUCCESS, len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  return icelake::validate_utf32(buf, len);
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  const char32_t *buf_orig = buf;
  if (len >= 16) {
    const char32_t *end = buf + len - 16;
    while (buf <= end) {
      __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
      __mmask16 outside_range = _mm512_cmp_epu32_mask(
          utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);

      __m512i utf32_off =
          _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));

      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(
          utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
      if ((outside_range | surrogate_range)) {
        auto outside_idx = _tzcnt_u32(outside_range);
        auto surrogate_idx = _tzcnt_u32(surrogate_range);

        if (outside_idx < surrogate_idx) {
          return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
        }

        return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
      }

      buf += 16;
    }
  }
  if (len > 0) {
    __m512i utf32 = _mm512_maskz_loadu_epi32(
        __mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf);
    __mmask16 outside_range = _mm512_cmp_epu32_mask(
        utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
    __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));

    __mmask16 surrogate_range = _mm512_cmp_epu32_mask(
        utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
    if ((outside_range | surrogate_range)) {
      auto outside_idx = _tzcnt_u32(outside_range);
      auto surrogate_idx = _tzcnt_u32(surrogate_range);

      if (outside_idx < surrogate_idx) {
        return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
      }

      return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
    }
  }

  return result(error_code::SUCCESS, len);
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  avx512_convert_latin1_to_utf32(buf, len, utf32_output);
  return len;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  // First, try to convert as much as possible using the SIMD implementation.
  const char *obuf = buf;
  char *olatin1_output = latin1_output;
  size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output);

  // If we have completely converted the string
  if (obuf == buf + len) {
    return {simdutf::SUCCESS, written};
  }
  size_t pos = obuf - buf;
  result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
      pos, buf + pos, len - pos, latin1_output);
  res.count += pos;
  return res;
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
  utf8_to_utf32_result ret =
      icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
          buf, len, utf32_output);
  if (ret.second == nullptr)
    return 0;

  size_t saved_bytes = ret.second - utf32_output;
  const char *end = buf + len;
  if (ret.first == end) {
    return saved_bytes;
  }

  // Note: the AVX512 procedure looks up 4 bytes forward, and
  //       correctly converts multi-byte chars even if their
  //       continuation bytes lie outside 16-byte window.
  //       It means, we have to skip continuation bytes from
  //       the beginning ret.first, as they were already consumed.
  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
    ret.first += 1;
  }
  if (ret.first != end) {
    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }

  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return {error_code::SUCCESS, 0};
  }
  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32);
  auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<
      endianness::LITTLE, uint32_t>(buf, len, utf32_output);

  if (!std::get<2>(ret)) {
    size_t pos = std::get<0>(ret) - buf;
    // We might have an error that occurs right before  pos.
    // This is only a concern if buf[pos] is not a continuation byte.
    if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) {
      pos -= 1;
    } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) {
      // We must check whether we are the fourth continuation byte
      bool c1 = (buf[pos - 1] & 0xc0) == 0x80;
      bool c2 = (buf[pos - 2] & 0xc0) == 0x80;
      bool c3 = (buf[pos - 3] & 0xc0) == 0x80;
      if (c1 && c2 && c3) {
        return {simdutf::TOO_LONG, pos};
      }
    }
    // todo: we reset the output to utf32 instead of using std::get<2.(ret) as
    // you'd expect. that is because
    // validating_utf8_to_fixed_length_with_constant_checks may have processed
    // data beyond the error.
    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
        pos, buf + pos, len - pos, utf32);
    res.count += pos;
    return res;
  }
  size_t saved_bytes = std::get<1>(ret) - utf32_output;
  const char *end = buf + len;
  if (std::get<0>(ret) == end) {
    return {simdutf::SUCCESS, saved_bytes};
  }

  // Note: the AVX512 procedure looks up 4 bytes forward, and
  //       correctly converts multi-byte chars even if their
  //       continuation bytes lie outside 16-byte window.
  //       It means, we have to skip continuation bytes from
  //       the beginning ret.first, as they were already consumed.
  while (std::get<0>(ret) != end and
         ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
    std::get<0>(ret) += 1;
  }

  if (std::get<0>(ret) != end) {
    auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
        std::get<0>(ret), len - (std::get<0>(ret) - buf),
        reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
    if (scalar_result.error != simdutf::SUCCESS) {
      scalar_result.count += (std::get<0>(ret) - buf);
    } else {
      scalar_result.count += saved_bytes;
    }
    return scalar_result;
  }

  return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
  utf8_to_utf32_result ret =
      icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
          buf, len, utf32_output);
  size_t saved_bytes = ret.second - utf32_output;
  const char *end = buf + len;
  if (ret.first == end) {
    return saved_bytes;
  }

  // Note: AVX512 procedure looks up 4 bytes forward, and
  //       correctly converts multi-byte chars even if their
  //       continuation bytes lie outsiede 16-byte window.
  //       It meas, we have to skip continuation bytes from
  //       the beginning ret.first, as they were already consumed.
  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
    ret.first += 1;
  }

  if (ret.first != end) {
    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }

  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output)
      .first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      avx512_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf32_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
  size_t answer =
      length / sizeof(__m512i) *
      sizeof(__m512i); // Number of 512-bit chunks that fits into the length.
  size_t i = 0;
  __m512i unrolled_popcount{0};

  const __m512i continuation = _mm512_set1_epi8(char(0b10111111));

  while (i + sizeof(__m512i) <= length) {
    size_t iterations = (length - i) / sizeof(__m512i);

    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
    for (; i + 8 * sizeof(__m512i) <= max_i; i += 8 * sizeof(__m512i)) {
      __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
      __m512i input2 =
          _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
      __m512i input3 =
          _mm512_loadu_si512((const __m512i *)(str + i + 2 * sizeof(__m512i)));
      __m512i input4 =
          _mm512_loadu_si512((const __m512i *)(str + i + 3 * sizeof(__m512i)));
      __m512i input5 =
          _mm512_loadu_si512((const __m512i *)(str + i + 4 * sizeof(__m512i)));
      __m512i input6 =
          _mm512_loadu_si512((const __m512i *)(str + i + 5 * sizeof(__m512i)));
      __m512i input7 =
          _mm512_loadu_si512((const __m512i *)(str + i + 6 * sizeof(__m512i)));
      __m512i input8 =
          _mm512_loadu_si512((const __m512i *)(str + i + 7 * sizeof(__m512i)));

      __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation);
      __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation);
      __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation);
      __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation);
      __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation);
      __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation);
      __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation);
      __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation);

      __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5,
                                               mask4, mask3, mask2, mask1);

      unrolled_popcount = _mm512_add_epi64(unrolled_popcount,
                                           _mm512_popcnt_epi64(mask_register));
    }

    for (; i <= max_i; i += sizeof(__m512i)) {
      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
      uint64_t continuation_bitmask = static_cast<uint64_t>(
          _mm512_cmple_epi8_mask(more_input, continuation));
      answer -= count_ones(continuation_bitmask);
    }
  }

  answer -= _mm512_reduce_add_epi64(unrolled_popcount);

  return answer + scalar::utf8::count_code_points(
                      reinterpret_cast<const char *>(str + i), length - i);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
  size_t answer = length / sizeof(__m512i) * sizeof(__m512i);
  size_t i = 0;
  if (answer >= 2048) // long strings optimization
  {
    unsigned char v_0xFF = 0xff;
    __m512i eight_64bits = _mm512_setzero_si512();
    while (i + sizeof(__m512i) <= length) {
      __m512i runner = _mm512_setzero_si512();
      size_t iterations = (length - i) / sizeof(__m512i);
      if (iterations > 255) {
        iterations = 255;
      }
      size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
      for (; i + 4 * sizeof(__m512i) <= max_i; i += 4 * sizeof(__m512i)) {
        // Load four __m512i vectors
        __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
        __m512i input2 =
            _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
        __m512i input3 = _mm512_loadu_si512(
            (const __m512i *)(str + i + 2 * sizeof(__m512i)));
        __m512i input4 = _mm512_loadu_si512(
            (const __m512i *)(str + i + 3 * sizeof(__m512i)));

        // Generate four masks
        __mmask64 mask1 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1);
        __mmask64 mask2 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
        __mmask64 mask3 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
        __mmask64 mask4 =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
        // Apply the masks and subtract from the runner
        __m512i not_ascii1 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF);
        __m512i not_ascii2 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
        __m512i not_ascii3 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
        __m512i not_ascii4 =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);

        runner = _mm512_sub_epi8(runner, not_ascii1);
        runner = _mm512_sub_epi8(runner, not_ascii2);
        runner = _mm512_sub_epi8(runner, not_ascii3);
        runner = _mm512_sub_epi8(runner, not_ascii4);
      }

      for (; i <= max_i; i += sizeof(__m512i)) {
        __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));

        __mmask64 mask =
            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
        __m512i not_ascii =
            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
        runner = _mm512_sub_epi8(runner, not_ascii);
      }

      eight_64bits = _mm512_add_epi64(
          eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512()));
    }

    answer += _mm512_reduce_add_epi64(eight_64bits);
  } else if (answer > 0) {
    for (; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) {
      __m512i latin = _mm512_loadu_si512((const __m512i *)(str + i));
      uint64_t non_ascii = _mm512_movepi8_mask(latin);
      answer += count_ones(non_ascii);
    }
  }
  return answer + scalar::latin1::utf8_length_from_latin1(
                      reinterpret_cast<const char *>(str + i), length - i);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return implementation::count_utf8(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64_impl<true, true>(output, input, length, options,
                                          line_length);
  } else {
    return encode_base64_impl<false, true>(output, input, length, options,
                                           line_length);
  }
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return util_find(start, end, character);
}
const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return util_find(start, end, character);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return icelake_binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return icelake_binary_length_from_base64(input, length);
}

} // namespace icelake
} // namespace simdutf

/* begin file src/simdutf/icelake/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif


#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/icelake/end.h */
/* end file src/icelake/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_HASWELL
/* begin file src/haswell/implementation.cpp */
/* begin file src/simdutf/haswell/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "haswell"
// #define SIMDUTF_IMPLEMENTATION haswell
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_TARGET_HASWELL
#endif

#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
// clang-format off
SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
// clang-format on
#endif // end of workaround
/* end file src/simdutf/haswell/begin.h */

namespace simdutf {
namespace haswell {
namespace {
#ifndef SIMDUTF_HASWELL_H
  #error "haswell.h must be included"
#endif
using namespace simd;

simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.reduce_or().is_ascii();
}

simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<uint8_t> is_third_byte =
      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80
  simd8<uint8_t> is_fourth_byte =
      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80
  return simd8<bool>(is_third_byte | is_fourth_byte);
}

/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */
std::pair<const char *, char *>
avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
                            char *utf8_output) {
  const char *end = latin1_input + len;
  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
  const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
  const size_t safety_margin = 12;

  while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
    __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
    const __m128i v_80 = _mm_set1_epi8((char)0x80);
    if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
      // 1. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, in8);
      // 2. adjust pointers
      latin1_input += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // We proceed only with the first 16 bytes.
    const __m256i in = _mm256_cvtepu8_epi16((in8));

    // 1. prepare 2-byte values
    // input 16-bit word : [0000|0000|aabb|bbbb] x 8
    // expected output   : [1100|00aa|10bb|bbbb] x 8
    const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
    const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

    // t0 = [0000|00aa|bbbb|bb00]
    const __m256i t0 = _mm256_slli_epi16(in, 2);
    // t1 = [0000|00aa|0000|0000]
    const __m256i t1 = _mm256_and_si256(t0, v_1f00);
    // t2 = [0000|0000|00bb|bbbb]
    const __m256i t2 = _mm256_and_si256(in, v_003f);
    // t3 = [000a|aaaa|00bb|bbbb]
    const __m256i t3 = _mm256_or_si256(t1, t2);
    // t4 = [1100|00aa|10bb|bbbb]
    const __m256i t4 = _mm256_or_si256(t3, v_c080);

    // 2. merge ASCII and 2-byte codewords

    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);

    // 3. prepare bitmask for 8-bit lookup
    const uint32_t M0 = one_byte_bitmask & 0x55555555;
    const uint32_t M1 = M0 >> 7;
    const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
    // 4. pack the bytes

    const uint8_t *row =
        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
    const uint8_t *row_2 =
        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)]
                                                            [0];

    const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
    const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

    const __m256i utf8_packed = _mm256_shuffle_epi8(
        utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
    // 5. store bytes
    _mm_storeu_si128((__m128i *)utf8_output,
                     _mm256_castsi256_si128(utf8_packed));
    utf8_output += row[0];
    _mm_storeu_si128((__m128i *)utf8_output,
                     _mm256_extractf128_si256(utf8_packed, 1));
    utf8_output += row_2[0];

    // 6. adjust pointers
    latin1_input += 16;
    continue;

  } // while
  return std::make_pair(latin1_input, utf8_output);
}
/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */

/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
avx2_convert_latin1_to_utf32(const char *buf, size_t len,
                             char32_t *utf32_output) {
  size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8

  for (size_t i = 0; i < rounded_len; i += 8) {
    // Load 8 Latin1 characters into a 64-bit register
    __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]);

    // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using
    // vpmovzxbd
    __m256i out = _mm256_cvtepu8_epi32(in);

    // Store the results back to memory
    _mm256_storeu_si256((__m256i *)&utf32_output[i], out);
  }

  // return pointers pointing to where we left off
  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
}
/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */

/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
                        _mm256_cvtepu8_epi32(in));
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8),
                        _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
    utf32_output += 12; // We wrote 12 32-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm256_storeu_si256((__m256i *)utf32_output,
                        _mm256_cvtepu16_epi32(composed));
    utf32_output += 8; // We wrote 16 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fallback.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code-code units
    // this is a relatively easy scenario
    // we process SIX (6) input code-code units. The max length in bytes of six
    // code code units spanning between 1 and 2 bytes each is 12 bytes. On
    // processors where pdep/pext is fast, we might be able to use a small
    // lookup table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm256_storeu_si256((__m256i *)utf32_output,
                        _mm256_cvtepu16_epi32(composed));
    utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
    // overflow of 32 - 24 = 8 bytes.
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code-code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output +=
        3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */

/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                             char *latin1_output) {
  const size_t rounded_len =
      len & ~0x1F; // Round down to nearest multiple of 32

  const __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);

  for (size_t i = 0; i < rounded_len; i += 4 * 8) {
    __m256i a = _mm256_loadu_si256((__m256i *)(buf + 0 * 8));
    __m256i b = _mm256_loadu_si256((__m256i *)(buf + 1 * 8));
    __m256i c = _mm256_loadu_si256((__m256i *)(buf + 2 * 8));
    __m256i d = _mm256_loadu_si256((__m256i *)(buf + 3 * 8));

    const __m256i check_combined =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
      return std::make_pair(nullptr, latin1_output);
    }

    b = _mm256_slli_epi32(b, 1 * 8);
    c = _mm256_slli_epi32(c, 2 * 8);
    d = _mm256_slli_epi32(d, 3 * 8);

    // clang-format off

    // a  = [.. .. .. a7|.. .. .. a6|.. .. .. a5|.. .. .. a4||.. .. .. a3|.. .. .. a2|.. .. .. a1|.. .. .. a0]
    // b  = [.. .. b7 ..|.. .. b6 ..|.. .. b5 ..|.. .. b4 ..||.. .. b3 ..|.. .. b2 ..|.. .. b1 ..|.. .. b0 ..]
    // c  = [.. c7 .. ..|.. c6 .. ..|.. c5 .. ..|.. c4 .. ..||.. c3 .. ..|.. c2 .. ..|.. c1 .. ..|.. c0 .. ..]
    // d  = [d7 .. .. ..|d6 .. .. ..|d5 .. .. ..|d4 .. .. ..||d3 .. .. ..|d2 .. .. ..|d1 .. .. ..|d0 .. .. ..]

    // t0 = [d7 c7 b7 a7|d6 c6 b6 a6|d5 c5 b5 a5|d4 c4 b4 a4||d3 c3 b3 a3|d2 c2 b2 a2|d1 c1 b1 a1|d0 c0 b0 a0]
    const __m256i t0 =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    // shuffle bytes within 128-bit lanes
    // t1 = [d7 d6 d5 d4|c7 c6 c5 c4|b7 b6 b5 b4|a7 a6 a5 a4||d3 d2 d1 d0|c3 c2 c1 c0|b3 b2 b1 b0|a3 a2 a1 a0]
    const __m256i shuffle_bytes =
        _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    const __m256i t1 = _mm256_shuffle_epi8(t0, shuffle_bytes);

    // reshuffle dwords
    // t2 = [d7 d6 d5 d4|d3 d2 d1 d0|c7 c6 c5 c4|c3 c2 c1 c0||b7 b6 b5 b4|b3 b2 b1 b0|a7 a6 a5 a4|a3 a2 a1 a0]
    const __m256i shuffle_dwords = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    const __m256i t2 = _mm256_permutevar8x32_epi32(t1, shuffle_dwords);
// clang format on

    _mm256_storeu_si256((__m256i *)latin1_output, t2);

    latin1_output += 32;
    buf += 32;
  }

  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                         char *latin1_output) {
  const size_t rounded_len =
      len & ~0x1F; // Round down to nearest multiple of 32

  const char32_t *start = buf;

  const __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);

  for (size_t i = 0; i < rounded_len; i += 4 * 8) {
    __m256i a = _mm256_loadu_si256((__m256i *)(buf + 0 * 8));
    __m256i b = _mm256_loadu_si256((__m256i *)(buf + 1 * 8));
    __m256i c = _mm256_loadu_si256((__m256i *)(buf + 2 * 8));
    __m256i d = _mm256_loadu_si256((__m256i *)(buf + 3 * 8));

    const __m256i check_combined =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
      // Fallback to scalar code for handling errors
      for (int k = 0; k < 4 * 8; k++) {
        char32_t codepoint = buf[k];
        if (codepoint <= 0xFF) {
          *latin1_output++ = static_cast<char>(codepoint);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
    }

    b = _mm256_slli_epi32(b, 1 * 8);
    c = _mm256_slli_epi32(c, 2 * 8);
    d = _mm256_slli_epi32(d, 3 * 8);

    const __m256i t0 =
        _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));

    const __m256i shuffle_bytes =
        _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    const __m256i t1 = _mm256_shuffle_epi8(t0, shuffle_bytes);

    const __m256i shuffle_dwords = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    const __m256i t2 = _mm256_permutevar8x32_epi32(t1, shuffle_dwords);

    _mm256_storeu_si256((__m256i *)latin1_output, t2);

    latin1_output += 32;
    buf += 32;
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */

/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
  const char32_t *end = buf + len;
  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
  __m256i running_max = _mm256_setzero_si256();
  __m256i forbidden_bytemask = _mm256_setzero_si256();

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
                                        _mm256_and_si256(nextin, v_7fffffff));
    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);

    // Try to apply UTF-16 => UTF-8 routine on 256 bits
    // (haswell/avx2_convert_utf16_to_utf8.cpp)

    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // Must check for overflow in packing
    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffffffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
      forbidden_bytemask = _mm256_or_si256(
          forbidden_bytemask,
          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));

      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
        single UFT-8 byte
        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
        UTF-8 bytes
        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
        three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
      _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
    } else {
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may require
      // large, non-trivial tables?
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else { // 4-byte
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
    return std::make_pair(nullptr, utf8_output);
  }

  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
    return std::make_pair(nullptr, utf8_output);
  }

  return std::make_pair(buf, utf8_output);
}

std::pair<result, char *>
avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                       char *utf8_output) {
  const char32_t *end = buf + len;
  const char32_t *start = buf;

  const __m256i v_0000 = _mm256_setzero_si256();
  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = _mm256_loadu_si256((__m256i *)buf);
    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
    // Check for too large input
    const __m256i max_input =
        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
    if (static_cast<uint32_t>(_mm256_movemask_epi8(
            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            utf8_output);
    }

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
                                        _mm256_and_si256(nextin, v_7fffffff));
    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);

    // Try to apply UTF-16 => UTF-8 routine on 256 bits
    // (haswell/avx2_convert_utf16_to_utf8.cpp)

    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      const __m128i utf8_packed = _mm_packus_epi16(
          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 16;
      utf8_output += 16;
      continue; // we are done for this round!
    }
    // no bits set above 7th bit
    const __m256i one_byte_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
    const uint32_t one_byte_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m256i one_or_two_bytes_bytemask =
        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
    const uint32_t one_or_two_bytes_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
    if (one_or_two_bytes_bitmask == 0xffffffff) {
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m256i t3 = _mm256_or_si256(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m256i t4 = _mm256_or_si256(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m256i utf8_unpacked =
          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      const uint32_t M0 = one_byte_bitmask & 0x55555555;
      const uint32_t M1 = M0 >> 7;
      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
      // 4. pack the bytes

      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
      const uint8_t *row_2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
                                                                       16)][0];

      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));

      const __m256i utf8_packed = _mm256_shuffle_epi8(
          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_castsi256_si128(utf8_packed));
      utf8_output += row[0];
      _mm_storeu_si128((__m128i *)utf8_output,
                       _mm256_extractf128_si256(utf8_packed, 1));
      utf8_output += row_2[0];

      // 6. adjust pointers
      buf += 16;
      continue;
    }
    // Must check for overflow in packing
    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffffffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes

      // Check for illegal surrogate code units
      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
      const __m256i forbidden_bytemask =
          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
          0x0) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              utf8_output);
      }

      const __m256i dup_even = _mm256_setr_epi16(
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
        single UFT-8 byte
        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
        UTF-8 bytes
        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
        three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
                                             simdutf_vec(0b0100000000000000));
      const __m256i s4 = _mm256_xor_si256(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
      // Due to the wider registers, the following path is less likely to be
      // useful.
      /*if(mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m256i shuffle =
      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
      _mm256_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
        utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
        _mm_storeu_si128((__m128i*)utf8_output,
      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
        continue;
      }*/
      const uint8_t mask0 = uint8_t(mask);
      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 =
          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);

      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
      const uint8_t *row2 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
      const __m128i utf8_2 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);

      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
      const uint8_t *row3 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
      const __m128i utf8_3 =
          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
      utf8_output += row2[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
      utf8_output += row3[0];
      buf += 16;
    } else {
      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD may require
      // large, non-trivial tables?
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else { // 4-byte
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
}
/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */

/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  const __m128i in = _mm_loadu_si128((__m128i *)input);

  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask &
      0xfff; // we are only processing 12 bytes in case it is not all ASCII

  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
    latin1_output += 12; // We wrote 12 characters.
    return 12;           // We consumed 1 bytes.
  }
  /// We do not have a fast path available, so we fallback.
  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we should have (idx < 64), if not, there is a bug in the validation or
  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes. On
  // processors where pdep/pext is fast, we might be able to use a small lookup
  // table.
  const __m128i sh =
      _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
  const __m128i perm = _mm_shuffle_epi8(in, sh);
  const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
  const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
  __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
  const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
  // writing 8 bytes even though we only care about the first 6 bytes.
  // performance note: it would be faster to use _mm_storeu_si128, we should
  // investigate.
  _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */

/* begin file src/haswell/avx2_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

template <bool base64_url>
simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
  // Precomputed shuffle masks for K = 1 to 16
  // credit: Wojciech Muła
  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
  result =
      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
  __m256i shift_LUT;
  if (base64_url) {
    shift_LUT = _mm256_setr_epi8(
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,

        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
  } else {
    shift_LUT = _mm256_setr_epi8(
        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,

        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
  }

  result = _mm256_shuffle_epi8(shift_LUT, result);
  return _mm256_add_epi8(result, input);
}

simdutf_really_inline __m256i insert_line_feed32(__m256i input, int K) {

  static const uint8_t low_table[16][32] = {
      {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
       0,    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
       0, 1,    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
       0, 1, 2,    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
       0, 1, 2, 3,    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
       0, 1, 2, 3, 4,    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5,    6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9,  10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6,    7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9,  10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7,    8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9,  10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8,    9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9,  10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9,    10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,   11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,   12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,   13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,   14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,   15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}};
  static const uint8_t high_table[16][32] = {
      {0,    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1,    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2,    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3,    4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4,    5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5,    6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6,    7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7,    8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8,    9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,    10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9,  10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,   11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,   12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,   13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,   14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,   15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}};

  __m256i line_feed_vector = _mm256_set1_epi8('\n');
  if (K >= 16) {
    __m256i mask = _mm256_loadu_si256((const __m256i *)high_table[K - 16]);
    __m256i lf_pos =
        _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast<char>(0x80)));
    __m256i shuffled = _mm256_shuffle_epi8(input, mask);
    __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
    return result;
  }
  // Shift input right by 1 byte
  __m256i shift = _mm256_alignr_epi8(
      input, _mm256_permute2x128_si256(input, input, 0x21), 15);

  input = _mm256_blend_epi32(input, shift, 0xF0);

  __m256i mask = _mm256_loadu_si256((const __m256i *)low_table[K]);

  __m256i lf_pos =
      _mm256_cmpeq_epi8(mask, _mm256_set1_epi8(static_cast<char>(0x80)));
  __m256i shuffled = _mm256_shuffle_epi8(input, mask);

  __m256i result = _mm256_blendv_epi8(shuffled, line_feed_vector, lf_pos);
  return result;
}

template <bool isbase64url, bool use_lines>
size_t
avx2_encode_base64_impl(char *dst, const char *src, size_t srclen,
                        base64_options options,
                        size_t line_length = simdutf::default_line_length) {
  size_t offset = 0;

  if (line_length < 4) {
    line_length = 4; // We do not support line_length less than 4
  }
  // credit: Wojciech Muła
  const uint8_t *input = (const uint8_t *)src;

  uint8_t *out = (uint8_t *)dst;
  const __m256i shuf =
      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,

                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
  size_t i = 0;
  for (; i + 100 <= srclen; i += 96) {
    const __m128i lo0 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
    const __m128i hi0 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
    const __m128i lo1 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
    const __m128i hi1 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
    const __m128i lo2 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
    const __m128i hi2 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
    const __m128i lo3 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
    const __m128i hi3 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));

    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);

    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));

    const __m256i t1_0 =
        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
    const __m256i t1_1 =
        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
    const __m256i t1_2 =
        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
    const __m256i t1_3 =
        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));

    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));

    const __m256i t3_0 =
        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
    const __m256i t3_1 =
        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
    const __m256i t3_2 =
        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
    const __m256i t3_3 =
        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));

    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);

    if (use_lines) {
      if (line_length >= 32) { // fast path
        __m256i result;
        result = lookup_pshufb_improved<isbase64url>(input0);
        if (offset + 32 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 32 - location_end;
          // We could do this, or extract instead.
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
          _mm256_storeu_si256(
              reinterpret_cast<__m256i *>(out),
              insert_line_feed32(result, static_cast<int>(location_end)));
          offset = to_move;
          out += 32 + 1;
        } else {
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
          offset += 32;
          out += 32;
        }
        result = lookup_pshufb_improved<isbase64url>(input1);

        if (offset + 32 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 32 - location_end;

          // We could do this, or extract instead.
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
          _mm256_storeu_si256(
              reinterpret_cast<__m256i *>(out),
              insert_line_feed32(result, static_cast<int>(location_end)));
          // see above.
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
          offset = to_move;
          out += 32 + 1;
        } else {

          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);

          offset += 32;
          out += 32;
        }
        result = lookup_pshufb_improved<isbase64url>(input2);

        if (offset + 32 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 32 - location_end;

          // We could do this, or extract instead.
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
          _mm256_storeu_si256(
              reinterpret_cast<__m256i *>(out),
              insert_line_feed32(result, static_cast<int>(location_end)));
          // see above.
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
          offset = to_move;
          out += 32 + 1;
        } else {
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
          offset += 32;
          out += 32;
        }
        result = lookup_pshufb_improved<isbase64url>(input3);

        if (offset + 32 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 32 - location_end;

          // We could do this, or extract instead.
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 1), result);
          _mm256_storeu_si256(
              reinterpret_cast<__m256i *>(out),
              insert_line_feed32(result, static_cast<int>(location_end)));
          // see above.
          // out[32] = static_cast<uint8_t>(_mm256_extract_epi8(result, 31));
          offset = to_move;
          out += 32 + 1;
        } else {
          _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), result);
          offset += 32;
          out += 32;
        }
      } else { // slow path
        // could be optimized
        uint8_t buffer[128];
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
                            lookup_pshufb_improved<isbase64url>(input0));
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 32),
                            lookup_pshufb_improved<isbase64url>(input1));
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 64),
                            lookup_pshufb_improved<isbase64url>(input2));
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + 96),
                            lookup_pshufb_improved<isbase64url>(input3));
        size_t out_pos = 0;
        size_t local_offset = offset;
        for (size_t j = 0; j < 128;) {
          if (local_offset == line_length) {
            out[out_pos++] = '\n';
            local_offset = 0;
          }
          out[out_pos++] = buffer[j++];
          local_offset++;
        }
        offset = local_offset;
        out += out_pos;
      }
    } else {
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
                          lookup_pshufb_improved<isbase64url>(input0));
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32),
                          lookup_pshufb_improved<isbase64url>(input1));
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 64),
                          lookup_pshufb_improved<isbase64url>(input2));
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 96),
                          lookup_pshufb_improved<isbase64url>(input3));

      out += 128;
    }
  }
  for (; i + 28 <= srclen; i += 24) {
    // lo = [xxxx|DDDC|CCBB|BAAA]
    // hi = [xxxx|HHHG|GGFF|FEEE]
    const __m128i lo =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
    const __m128i hi =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));

    // bytes from groups A, B and C are needed in separate 32-bit lanes
    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);

    // this part is well commented in encode.sse.cpp

    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
    const __m256i indices = _mm256_or_si256(t1, t3);

    if (use_lines) {
      if (line_length >= 32) { // fast path
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
                            lookup_pshufb_improved<isbase64url>(indices));

        if (offset + 32 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 32 - location_end;
          std::memmove(out + location_end + 1, out + location_end, to_move);
          out[location_end] = '\n';
          offset = to_move;
          out += 32 + 1;
        } else {
          offset += 32;
          out += 32;
        }
      } else { // slow path
        // could be optimized
        alignas(32) uint8_t buffer[32];
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer),
                            lookup_pshufb_improved<isbase64url>(indices));
        std::memcpy(out, buffer, 32);
        size_t out_pos = 0;
        size_t local_offset = offset;
        for (size_t j = 0; j < 32;) {
          if (local_offset == line_length) {
            out[out_pos++] = '\n';
            local_offset = 0;
          }
          out[out_pos++] = buffer[j++];
          local_offset++;
        }
        offset = local_offset;
        out += out_pos;
      }
    } else {
      _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
                          lookup_pshufb_improved<isbase64url>(indices));

      out += 32;
    }
  }
  return ((char *)out - (char *)dst) +
         scalar::base64::tail_encode_base64_impl<use_lines>(
             (char *)out, src + i, srclen - i, options, line_length, offset);
}

template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  return avx2_encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
}

static inline void compress(__m128i data, uint16_t mask, char *output) {
  if (mask == 0) {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
    return;
  }
  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
                                    tables::base64::thintable_epi8[mask1]);
  // we increment by 0x08 the second half of the mask
  shufmask =
      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
  // this is the version "nearly pruned"
  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask, what it does is to write
  // only the first pop1 bytes from the first 8 bytes, and then
  // it fills in with the bytes from the second 8 bytes + some filling
  // at the end.
  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
      tables::base64::pshufb_combine_table + pop1 * 8));
  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);

  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
}

// --- decoding -----------------------------------------------

template <typename = void>
simdutf_really_inline void compress(__m256i data, uint32_t mask, char *output) {
  if (mask == 0) {
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
    return;
  }
  compress(_mm256_castsi256_si128(data), uint16_t(mask), output);
  compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
           output + count_ones(~mask & 0xFFFF));
}

template <typename = void>
simdutf_really_inline void base64_decode(char *out, __m256i str) {
  // credit: aqrit
  const __m256i pack_shuffle =
      _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
                       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
  const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
  const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
  const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);

  // Store the output:
  _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
  _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
}

template <typename = void>
simdutf_really_inline void base64_decode_block(char *out, const char *src) {
  base64_decode(out,
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
  base64_decode(out + 24, _mm256_loadu_si256(
                              reinterpret_cast<const __m256i *>(src + 32)));
}

template <typename = void>
simdutf_really_inline void base64_decode_block_safe(char *out,
                                                    const char *src) {
  base64_decode(out,
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
  alignas(32) char buffer[32]; // We enforce safety with a buffer.
  base64_decode(
      buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
  std::memcpy(out + 24, buffer, 24);
}

// --- decoding - base64 class --------------------------------

class block64 {
  __m256i chunks[2];

public:
  // The caller of this function is responsible to ensure that there are 64
  // bytes available from reading at src.
  simdutf_really_inline block64(const char *src) {
    chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
    chunks[1] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
  }

  // The caller of this function is responsible to ensure that there are 128
  // bytes available from reading at src.
  simdutf_really_inline block64(const char16_t *src) {
    const auto m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
    const auto m2 =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
    const auto m3 =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
    const auto m4 =
        _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));

    const auto m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
    const auto m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
    const auto m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
    const auto m4p = _mm256_permute2x128_si256(m3, m4, 0x31);

    chunks[0] = _mm256_packus_epi16(m1p, m2p);
    chunks[1] = _mm256_packus_epi16(m3p, m4p);
  }

  simdutf_really_inline void copy_block(char *output) {
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), chunks[0]);
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), chunks[1]);
  }

  // decode 64 bytes and output 48 bytes
  simdutf_really_inline void base64_decode_block(char *out) {
    base64_decode(out, chunks[0]);
    base64_decode(out + 24, chunks[1]);
  }

  simdutf_really_inline void base64_decode_block_safe(char *out) {
    base64_decode(out, chunks[0]);
    alignas(32) char buffer[32]; // We enforce safety with a buffer.
    base64_decode(buffer, chunks[1]);
    std::memcpy(out + 24, buffer, 24);
  }

  template <bool base64_url, bool ignore_garbage, bool default_or_url>
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
    uint32_t err0 = 0;
    uint32_t err1 = 0;
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        &chunks[0], &err0);
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        &chunks[1], &err1);
    if (!ignore_garbage) {
      *error = err0 | ((uint64_t)err1 << 32);
    }
    return m0 | (m1 << 32);
  }

  template <bool base64_url, bool ignore_garbage, bool default_or_url>
  simdutf_really_inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
    const __m256i ascii_space_tbl =
        _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
                         0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
                         0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
    // credit: aqrit
    __m256i delta_asso;
    if (default_or_url) {
      delta_asso = _mm256_setr_epi8(
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
          0x00, 0x00, 0x11, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
    } else if (base64_url) {
      delta_asso = _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
                                    0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1,
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
                                    0x0, 0x0, 0xF, 0x0, 0xF);
    } else {
      delta_asso = _mm256_setr_epi8(
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
          0x00, 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
          0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
    }

    __m256i delta_values;
    if (default_or_url) {
      delta_values = _mm256_setr_epi8(
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9),
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));
    } else if (base64_url) {
      delta_values = _mm256_setr_epi8(
          0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
          uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
          uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
          uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
    } else {
      delta_values = _mm256_setr_epi8(
          int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
          int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
          int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
          int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
          int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
          int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
          int8_t(0xB9), int8_t(0xB9));
    }

    __m256i check_asso;
    if (default_or_url) {
      check_asso = _mm256_setr_epi8(
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
          0x07, 0x0B, 0x0E, 0x0B, 0x06, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
    } else if (base64_url) {
      check_asso = _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
                                    0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1,
                                    0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
                                    0x7, 0xB, 0xE, 0xB, 0x6);
    } else {
      check_asso = _mm256_setr_epi8(
          0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03,
          0x07, 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
          0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
    }
    __m256i check_values;
    if (default_or_url) {
      check_values = _mm256_setr_epi8(
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80),
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
    } else if (base64_url) {
      check_values = _mm256_setr_epi8(
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
          uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
          0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
          uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
          uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
          uint8_t(0x80), 0x0, uint8_t(0x80));
    } else {
      check_values = _mm256_setr_epi8(
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
          int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
          int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
          int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
          int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
          int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
          int8_t(0x91), int8_t(0x80));
    }
    const __m256i shifted = _mm256_srli_epi32(*src, 3);
    __m256i delta_hash =
        _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
    if (default_or_url) {
      delta_hash = _mm256_and_si256(delta_hash, _mm256_set1_epi8(0xf));
    }
    const __m256i check_hash =
        _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
    const __m256i out =
        _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
    const __m256i chk =
        _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
    const int mask = _mm256_movemask_epi8(chk);
    if (!ignore_garbage && mask) {
      __m256i ascii_space =
          _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
      *error = (mask ^ _mm256_movemask_epi8(ascii_space));
    }
    *src = out;
    return (uint32_t)mask;
  }

  simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
    if (is_power_of_two(mask)) {
      return compress_block_single(mask, output);
    }

    uint64_t nmask = ~mask;
    compress(chunks[0], uint32_t(mask), output);
    compress(chunks[1], uint32_t(mask >> 32),
             output + count_ones(nmask & 0xFFFFFFFF));
    return count_ones(nmask);
  }

  simdutf_really_inline size_t compress_block_single(uint64_t mask,
                                                     char *output) {
    const size_t pos64 = trailing_zeroes(mask);
    const int8_t pos = pos64 & 0xf;
    switch (pos64 >> 4) {
    case 0b00: {
      const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
      const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);

      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(lane0, sh);

      _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed);
      _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), lane1);
      _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
    } break;
    case 0b01: {
      const __m128i lane0 = _mm256_extracti128_si256(chunks[0], 0);
      const __m128i lane1 = _mm256_extracti128_si256(chunks[0], 1);
      _mm_storeu_si128((__m128i *)(output + 0 * 16), lane0);

      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(lane1, sh);

      _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed);
      _mm256_storeu_si256((__m256i *)(output + 2 * 16 - 1), chunks[1]);
    } break;
    case 0b10: {
      const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
      const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);

      _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);

      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(lane2, sh);

      _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed);
      _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), lane3);
    } break;
    case 0b11: {
      const __m128i lane2 = _mm256_extracti128_si256(chunks[1], 0);
      const __m128i lane3 = _mm256_extracti128_si256(chunks[1], 1);

      _mm256_storeu_si256((__m256i *)(output + 0 * 16), chunks[0]);
      _mm_storeu_si128((__m128i *)(output + 2 * 16), lane2);

      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(lane3, sh);

      _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed);
    } break;
    }

    return 63;
  }
};

simdutf_warn_unused size_t avx2_binary_length_from_base64(const char *input,
                                                          size_t length) {
  size_t count = 0;
  const char *ptr = input;
  const char *end = input + length;

  __m256i spaces = _mm256_set1_epi8(0x20);
  while (ptr + 32 <= end) {
    __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
    __m256i gt_space = _mm256_cmpgt_epi8(data, spaces);
    uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(gt_space));
    count += count_ones(mask);
    ptr += 32;
  }

  while (ptr < end) {
    count += (*ptr > 0x20) ? 1 : 0;
    ptr++;
  }

  size_t padding = 0;
  size_t pos = length;
  while (pos > 0 && padding < 2) {
    char c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

simdutf_warn_unused size_t avx2_binary_length_from_base64(const char16_t *input,
                                                          size_t length) {
  size_t count = 0;
  const char16_t *ptr = input;
  const char16_t *end = input + length;

  __m256i spaces = _mm256_set1_epi16(0x20);
  while (ptr + 16 <= end) {
    __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
    __m256i gt_space = _mm256_cmpgt_epi16(data, spaces);
    uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(gt_space));
    count += count_ones(mask);
    ptr += 16;
  }
  count /= 2;

  while (ptr < end) {
    count += (*ptr > 0x20) ? 1 : 0;
    ptr++;
  }

  size_t padding = 0;
  size_t pos = length;
  while (pos > 0 && padding < 2) {
    char16_t c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}
/* end file src/haswell/avx2_base64.cpp */

} // unnamed namespace
} // namespace haswell
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace haswell {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0. In
   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        0b11110000u - 1,
                                        0b11100000u - 1,
                                        0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */

/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace haswell {
namespace {
namespace ascii_validation {

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      return false;
    }
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  return in.is_ascii();
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/ascii_validation.h */

  // transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two-complement's, so largest possible continuation
      // byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace haswell {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // We don't count 0th bytes in the vectorized loops above, this
    // is why we need to count them in the end.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf32.h */

// other functions
/* begin file src/generic/utf8.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8.h */

  // transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then we have that margin-1 is the eight last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 inputs bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace haswell
} // namespace simdutf
  // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */

/* begin file src/generic/validate_utf32.h */
namespace simdutf {
namespace haswell {
namespace {
namespace utf32 {

simdutf_really_inline bool validate(const char32_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return true;
  }

  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff);
  const auto offset = vector_u32::splat(0xffff2000);
  const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
  auto currentmax = vector_u32::zero();
  auto currentoffsetmax = vector_u32::zero();

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if constexpr (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    currentmax = max(currentmax, in);
    currentoffsetmax = max(currentoffsetmax, in + offset);
    input += N;
  }

  const auto too_large = currentmax > standardmax;
  if (too_large.any()) {
    return false;
  }

  const auto surrogate = currentoffsetmax > standardoffsetmax;
  if (surrogate.any()) {
    return false;
  }

  return scalar::utf32::validate(input, end - input);
}

simdutf_really_inline result validate_with_errors(const char32_t *input,
                                                  size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return result(error_code::SUCCESS, 0);
  }

  const char32_t *start = input;
  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff + 1);
  const auto surrogate_mask = vector_u32::splat(0xfffff800);
  const auto surrogate_byte = vector_u32::splat(0x0000d800);

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if constexpr (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    const auto too_large = in >= standardmax;
    const auto surrogate = (in & surrogate_mask) == surrogate_byte;

    const auto combined = too_large | surrogate;
    if (simdutf_unlikely(combined.any())) {
      const size_t consumed = input - start;
      auto sr = scalar::utf32::validate_with_errors(input, end - input);
      sr.count += consumed;

      return sr;
    }

    input += N;
  }

  const size_t consumed = input - start;
  auto sr = scalar::utf32::validate_with_errors(input, end - input);
  sr.count += consumed;

  return sr;
}

} // namespace utf32
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/validate_utf32.h */

/* begin file src/generic/base64.h */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */
namespace simdutf {
namespace haswell {
namespace {
namespace base64 {

/*
    The following template function implements API for Base64 decoding.

    An implementation is responsible for providing the `block64` type and
    associated methods that perform actual conversion. Please refer
    to any vectorized implementation to learn the API of these procedures.
*/
template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t equalsigns = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0, true};
    }
    return {SUCCESS, full_input_length, 0};
  }
  char *end_of_safe_64byte_zone =
      dst == nullptr
          ? nullptr
          : ((srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63
                                        : dst);

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b(src);
      src += 64;
      uint64_t error = 0;
      const uint64_t badcharmask =
          b.to_base64_mask<base64_url, ignore_garbage, default_or_url>(&error);
      if (!ignore_garbage && error) {
        src -= 64;
        const size_t error_offset = trailing_zeroes(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        bufferptr += b.compress_block(badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        b.copy_block(bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          b.base64_decode_block_safe(dst);
        } else {
          b.base64_decode_block(dst);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if (!ignore_garbage &&
          (!scalar::base64::is_eight_byte(*src) || val > 64)) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r = scalar::base64::patch_tail_result(
        r, size_t(src - srcinit), size_t(dst - dstinit), equallocation,
        full_input_length, last_chunk_options);
    // When is_partial(last_chunk_options) is true, we must either end with
    // the end of the stream (beyond whitespace) or right after a non-ignorable
    // character or at the very beginning of the stream.
    // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
    if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
        r.input_count < full_input_length) {
      // First check if we can extend the input to the end of the stream
      while (r.input_count < full_input_length &&
             base64_ignorable(*(srcinit + r.input_count), options)) {
        r.input_count++;
      }
      // If we are still not at the end of the stream, then we must backtrack
      // to the last non-ignorable character.
      if (r.input_count < full_input_length) {
        while (r.input_count > 0 &&
               base64_ignorable(*(srcinit + r.input_count - 1), options)) {
          r.input_count--;
        }
      }
    }
    return r;
  }
  if (!ignore_garbage && equalsigns > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit),
              true};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}

} // namespace base64
} // unnamed namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/base64.h */
/* begin file src/generic/find.h */
namespace simdutf {
namespace haswell {
namespace {
namespace util {

simdutf_really_inline const char *find(const char *start, const char *end,
                                       char character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  // Align the start pointer to 64 bytes
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % 64;
  if (misalignment != 0) {
    size_t adjustment = 64 - misalignment;
    if (size_t(std::distance(start, end)) < adjustment) {
      adjustment = std::distance(start, end);
    }
    for (size_t i = 0; i < adjustment; i++) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for 64-byte aligned data
  for (; std::distance(start, end) >= 64; start += 64) {
    simd8x64<uint8_t> input(reinterpret_cast<const uint8_t *>(start));
    uint64_t matches = input.eq(uint8_t(character));
    if (matches != 0) {
      // Found a match, return the first one
      int index = trailing_zeroes(matches);
      return start + index;
    }
  }
  return std::find(start, end, character);
}

simdutf_really_inline const char16_t *
find(const char16_t *start, const char16_t *end, char16_t character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  // Align the start pointer to 64 bytes if misalignment is even
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % 64;
  if (misalignment != 0 && misalignment % 2 == 0) {
    size_t adjustment = (64 - misalignment) / sizeof(char16_t);
    if (size_t(std::distance(start, end)) < adjustment) {
      adjustment = std::distance(start, end);
    }
    for (size_t i = 0; i < adjustment; i++) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for 64-byte aligned data
  for (; std::distance(start, end) >= 32; start += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(start));
    uint64_t matches = input.eq(uint16_t(character));
    if (matches != 0) {
      // Found a match, return the first one
      int index = trailing_zeroes(matches) / 2;
      return start + index;
    }
  }
  return std::find(start, end, character);
}

} // namespace util
} // namespace
} // namespace haswell
} // namespace simdutf
/* end file src/generic/find.h */

namespace simdutf {
namespace haswell {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return haswell::utf8_validation::generic_validate_utf8(buf, len);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return haswell::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return haswell::ascii_validation::generic_validate_ascii_with_errors(buf,
                                                                       len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  return utf32::validate(buf, len);
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  return utf32::validate_with_errors(buf, len);
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      avx2_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }

  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      avx2_convert_latin1_to_utf32(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_converted_chars == 0) {
      return 0;
    }
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *input, size_t size, char *latin1_output) const noexcept {
  return utf8_to_latin1::convert_valid(input, size, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      avx2_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      avx2_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_utf32_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf32_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *in, size_t size) const noexcept {
  return utf8::count_code_points_bytemask(in, size);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t len) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
  size_t i = 0;
  if (answer >= 2048) { // long strings optimization
    __m256i four_64bits = _mm256_setzero_si256();
    while (i + sizeof(__m256i) <= len) {
      __m256i runner = _mm256_setzero_si256();
      // We can do up to 255 loops without overflow.
      size_t iterations = (len - i) / sizeof(__m256i);
      if (iterations > 255) {
        iterations = 255;
      }
      size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
      for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) {
        __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
        __m256i input2 =
            _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
        __m256i input3 = _mm256_loadu_si256(
            (const __m256i *)(data + i + 2 * sizeof(__m256i)));
        __m256i input4 = _mm256_loadu_si256(
            (const __m256i *)(data + i + 3 * sizeof(__m256i)));
        __m256i input12 =
            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
        __m256i input23 =
            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
        __m256i input1234 = _mm256_add_epi8(input12, input23);
        runner = _mm256_sub_epi8(runner, input1234);
      }
      for (; i <= max_i; i += sizeof(__m256i)) {
        __m256i input_256_chunk =
            _mm256_loadu_si256((const __m256i *)(data + i));
        runner = _mm256_sub_epi8(
            runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
      }
      four_64bits = _mm256_add_epi64(
          four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
    }
    answer += _mm256_extract_epi64(four_64bits, 0) +
              _mm256_extract_epi64(four_64bits, 1) +
              _mm256_extract_epi64(four_64bits, 2) +
              _mm256_extract_epi64(four_64bits, 3);
  } else if (answer > 0) {
    for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
      __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i));
      uint32_t non_ascii = _mm256_movemask_epi8(latin);
      answer += count_ones(non_ascii);
    }
  }
  return answer + scalar::latin1::utf8_length_from_latin1(
                      reinterpret_cast<const char *>(data + i), len - i);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  if (options & base64_url) {
    return avx2_encode_base64_impl<true, true>(output, input, length, options,
                                               line_length);
  } else {
    return avx2_encode_base64_impl<false, true>(output, input, length, options,
                                                line_length);
  }
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return util::find(start, end, character);
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return util::find(start, end, character);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return avx2_binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return avx2_binary_length_from_base64(input, length);
}

} // namespace haswell
} // namespace simdutf

/* begin file src/simdutf/haswell/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#undef SIMDUTF_SIMD_HAS_BYTEMASK

#if SIMDUTF_GCC11ORMORE // workaround for
                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
SIMDUTF_POP_DISABLE_WARNINGS
#endif // end of workaround
/* end file src/simdutf/haswell/end.h */
/* end file src/haswell/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_PPC64
/* begin file src/ppc64/implementation.cpp */
/* begin file src/simdutf/ppc64/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
// #define SIMDUTF_IMPLEMENTATION ppc64
/* end file src/simdutf/ppc64/begin.h */

/* begin file src/ppc64/ppc64_utf16_to_utf8_tables.h */
// Code generated automatically; DO NOT EDIT
// file generated by scripts/ppc64_convert_utf16_to_utf8.py
#ifndef PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
#define PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H

namespace simdutf {
namespace {
namespace tables {
namespace ppc64_utf16_to_utf8 {

#if SIMDUTF_IS_BIG_ENDIAN
// 1 byte for length, 16 bytes for mask
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
    {12, 1, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80},
    {9, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 17, 3, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 1, 0, 16, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {11, 1, 0, 16, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 0, 16, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 17, 2, 18, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 1, 0, 16, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 16, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 17, 19, 5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 0, 16, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 3, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 1, 0, 16, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 1, 0, 16, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 2, 18, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 0, 16, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 19, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {11, 1, 0, 16, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 0, 16, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 17, 3, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 1, 0, 16, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 1, 0, 16, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 16, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 17, 2, 18, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 0, 16, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 19, 4, 20, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 1, 0, 16, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 16, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 17, 3, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 1, 0, 16, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 1, 0, 16, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 2, 18, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 1, 0, 16, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 19, 21, 7, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 1, 0, 16, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 3, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 1, 0, 16, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 1, 0, 16, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 2, 18, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 0, 16, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 19, 5, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 1, 0, 16, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 3, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 1, 0, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {2, 0, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {5, 1, 0, 16, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 0, 16, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 17, 2, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 1, 0, 16, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 0, 16, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 17, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {8, 1, 0, 16, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 3, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 1, 0, 16, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 0, 16, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 17, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 1, 0, 16, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 2, 18, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 1, 0, 16, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 19, 4, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 1, 0, 16, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 1, 0, 16, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 0, 16, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 17, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {6, 1, 0, 16, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 2, 18, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 0, 16, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {4, 0, 16, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 17, 19, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {11, 1, 0, 16, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 0, 16, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 17, 3, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 1, 0, 16, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 1, 0, 16, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 16, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 17, 2, 18, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 0, 16, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 19, 5, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 1, 0, 16, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 3, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 1, 0, 16, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 0, 16, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 17, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 1, 0, 16, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 2, 18, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 1, 0, 16, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 19, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {10, 1, 0, 16, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 16, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 17, 3, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 1, 0, 16, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 1, 0, 16, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 2, 18, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 1, 0, 16, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 19, 4, 20, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 1, 0, 16, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 3, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 1, 0, 16, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 1, 0, 16, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 2, 18, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 0, 16, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 19, 21, 6, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 1, 0, 16, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 16, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 17, 3, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 1, 0, 16, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 1, 0, 16, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 2, 18, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 1, 0, 16, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 19, 5, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 0, 16, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 1, 0, 16, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 0, 16, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 17, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {6, 1, 0, 16, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 2, 18, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 0, 16, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {4, 0, 16, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 17, 19, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {9, 1, 0, 16, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 16, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 17, 3, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 1, 0, 16, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 1, 0, 16, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 2, 18, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 0, 16, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 19, 4, 20, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 0, 16, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 16, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 17, 3, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 1, 0, 16, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {4, 0, 16, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 17, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 1, 0, 16, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 0, 16, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 17, 2, 18, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 1, 0, 16, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 16, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 17, 19, 21, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
};
#else
// 1 byte for length, 16 bytes for mask
const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
    {12, 0, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80},
    {9, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {11, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
    {10, 16, 2, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 0, 1, 17, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {11, 0, 1, 17, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 1, 17, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 16, 3, 19, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 0, 1, 17, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 17, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 16, 18, 4, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 1, 17, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 2, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 1, 17, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 0, 1, 17, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 3, 19, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 1, 17, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 18, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {11, 0, 1, 17, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 1, 17, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 16, 2, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 1, 17, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 0, 1, 17, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 17, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 16, 3, 19, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 1, 17, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 18, 5, 21, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 0, 1, 17, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 17, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 16, 2, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 1, 17, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 0, 1, 17, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 3, 19, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 1, 17, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 18, 20, 6, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 0, 1, 17, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {6, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 2, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 1, 17, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 0, 1, 17, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 3, 19, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 1, 17, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 18, 4, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 0, 1, 17, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 0, 1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {2, 1, 17, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 16, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {5, 0, 1, 17, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 1, 17, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 16, 3, 19, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 0, 1, 17, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 1, 17, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 16, 18, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {8, 0, 1, 17, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 2, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 1, 17, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 1, 17, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 16, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 0, 1, 17, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 3, 19, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 0, 1, 17, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 18, 5, 21, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 0, 1, 17, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 2, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 1, 17, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 1, 17, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 16, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {6, 0, 1, 17, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 3, 19, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 1, 17, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {4, 1, 17, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 16, 18, 20, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {11, 0, 1, 17, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80},
    {8, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {10, 1, 17, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {9, 16, 2, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 1, 17, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 0, 1, 17, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 17, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 16, 3, 19, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 0, 1, 17, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 18, 4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 1, 17, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 2, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 1, 17, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {4, 1, 17, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 16, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 0, 1, 17, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 3, 19, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 0, 1, 17, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 18, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {10, 0, 1, 17, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 17, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 16, 2, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 1, 17, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 0, 1, 17, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 3, 19, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 1, 17, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 18, 5, 21, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 0, 1, 17, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 2, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 1, 17, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 0, 1, 17, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 3, 19, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 1, 17, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 18, 20, 7, 23, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {10, 0, 1, 17, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
    {7, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {9, 1, 17, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 16, 2, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 0, 1, 17, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {9, 0, 1, 17, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 3, 19, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {8, 0, 1, 17, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 18, 4, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 1, 17, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 2, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 0, 1, 17, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {1, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {3, 1, 17, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {2, 16, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {6, 0, 1, 17, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 3, 19, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 0, 1, 17, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {4, 1, 17, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 16, 18, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {9, 0, 1, 17, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 1, 17, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {7, 16, 2, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 0, 1, 17, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {8, 0, 1, 17, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 3, 19, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 0, 1, 17, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 18, 5, 21, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {8, 0, 1, 17, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {5, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {7, 1, 17, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {6, 16, 2, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 0, 1, 17, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {2, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80, 0x80},
    {4, 1, 17, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {3, 16, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {7, 0, 1, 17, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80},
    {4, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {6, 1, 17, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {5, 16, 3, 19, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {6, 0, 1, 17, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {3, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
    {5, 1, 17, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80},
    {4, 16, 18, 20, 22, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     0x80, 0x80, 0x80},
};
#endif // SIMDUTF_IS_BIG_ENDIAN
} // namespace ppc64_utf16_to_utf8
} // namespace tables
} // unnamed namespace
} // namespace simdutf

#endif // PPC64_SIMDUTF_UTF16_TO_UTF8_TABLES_H
/* end file src/ppc64/ppc64_utf16_to_utf8_tables.h */

namespace simdutf {
namespace ppc64 {
namespace {
#ifndef SIMDUTF_PPC64_H
  #error "ppc64.h must be included"
#endif
using namespace simd;

simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  // careful: 0x80 is not ascii.
  return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
}

simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<uint8_t> is_third_byte =
      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
  simd8<uint8_t> is_fourth_byte =
      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
  // Caller requires a bool (all 1's). All values resulting from the subtraction
  // will be <= 64, so signed comparison is fine.
  return simd8<bool>(is_third_byte | is_fourth_byte);
}

/// ErrorReporting describes behaviour of a vectorized procedure regarding error
/// checking
enum class ErrorReporting {
  precise,    // the procedure will report *approximate* or *precise* error
              // position
  at_the_end, // the procedure will only inform about an error after scanning
              // the whole input (or its significant portion)
  none,       // no error checking is done, we assume valid inputs
};

/* begin file src/ppc64/ppc64_write_to_utf8.cpp */
/*
 * reads a vector of uint16 values
 * bits after 11th are ignored
 * first 11 bits are encoded into utf8
 * !important! utf8_output must have at least 16 writable bytes
 */
simdutf_really_inline void
write_v_u16_11bits_to_utf8(const vector_u16 v_u16, char *&utf8_output,
                           const vector_u8 one_byte_bytemask,
                           const uint16_t one_byte_bitmask) {

  // 0b1100_0000_1000_0000
  const auto v_c080 = vector_u16(0xc080);
  // 0b0011_1111_0000_0000
  const auto v_1f00 = vector_u16(0x1f00);
  // 0b0000_0000_0011_1111
  const auto v_003f = vector_u16(0x003f);

  // 1. prepare 2-byte values
  // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
  // expected output   : [110a|aaaa|10bb|bbbb] x 8

  // t0 = [0000|0000|00bb|bbbb]
  const auto t0 = v_u16 & v_003f;
  // t1 = [000a|aaaa|bbbb|bb00]
  const auto t1 = v_u16.shl<2>();
  // t2 = [000a|aaaa|00bb|bbbb]
  const auto t2 = select(v_1f00, t1, t0);
  // t3 = [110a|aaaa|10bb|bbbb]
  const auto t3 = t2 | v_c080;

  // 2. merge ASCII and 2-byte codewords
  const auto utf8_unpacked1 =
      select(one_byte_bytemask, as_vector_u8(v_u16), as_vector_u8(t3));

#if SIMDUTF_IS_BIG_ENDIAN
  const auto tmp = as_vector_u16(utf8_unpacked1).swap_bytes();
#else
  const auto tmp = as_vector_u16(utf8_unpacked1);
#endif // SIMDUTF_IS_BIG_ENDIAN
  const auto utf8_unpacked = as_vector_u8(tmp);

  // 3. prepare bitmask for 8-bit lookup
  //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a
  //    - LSB)
  const uint16_t m0 = one_byte_bitmask & 0x5555;      // m0 = 0h0g0f0e0d0c0b0a
  const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
  const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
  // 4. pack the bytes
  const uint8_t *row =
      &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
  const auto shuffle = vector_u8::load(row + 1);
  const auto utf8_packed = shuffle.lookup_16(utf8_unpacked);

  // 5. store bytes
  utf8_packed.store(utf8_output);

  // 6. adjust pointers
  utf8_output += row[0];
}

inline void write_v_u16_11bits_to_utf8(const vector_u16 v_u16,
                                       char *&utf8_output,
                                       const vector_u16 v_0000,
                                       const vector_u16 v_ff80) {
  // no bits set above 7th bit
  const auto one_byte_bytemask = (v_u16 & v_ff80) == v_0000;
  const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask();

  write_v_u16_11bits_to_utf8(v_u16, utf8_output,
                             as_vector_u8(one_byte_bytemask), one_byte_bitmask);
}
/* end file src/ppc64/ppc64_write_to_utf8.cpp */

/* begin file src/ppc64/ppc64_convert_latin1_to_utf8.cpp */
std::pair<const char *const, char *const>
ppc64_convert_latin1_to_utf8(const char *latin_input,
                             const size_t latin_input_length,
                             char *utf8_output) {
  const char *end = latin_input + latin_input_length;

  const auto v_0000 = vector_u16::zero();
  const auto v_00 = vector_u8::zero();

  // 0b1111_1111_1000_0000
  const auto v_ff80 = vector_u16(0xff80);

#if SIMDUTF_IS_BIG_ENDIAN
  const auto latin_1_half_into_u16_byte_mask =
      vector_u8(16, 0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7);
  const auto latin_2_half_into_u16_byte_mask =
      vector_u8(16, 8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15);
#else
  const auto latin_1_half_into_u16_byte_mask =
      vector_u8(0, 16, 1, 16, 2, 16, 3, 16, 4, 16, 5, 16, 6, 16, 7, 16);
  const auto latin_2_half_into_u16_byte_mask =
      vector_u8(8, 16, 9, 16, 10, 16, 11, 16, 12, 16, 13, 16, 14, 16, 15, 16);
#endif // SIMDUTF_IS_BIG_ENDIAN

  // each latin1 takes 1-2 utf8 bytes
  // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then
  // adjust the pointer) so the last write can exceed the utf8_output size by
  // 8-1 bytes by reserving 8 extra input bytes, we expect the output to have
  // 8-16 bytes free
  while (end - latin_input >= 16 + 8) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    const auto v_latin = vector_u8::load(latin_input);

    if (v_latin.is_ascii()) { // ASCII fast path!!!!
      v_latin.store(utf8_output);
      latin_input += 16;
      utf8_output += 16;
      continue;
    }

    // assuming a/b are bytes and A/B are uint16 of the same value
    // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
    const vector_u16 v_u16_latin_1_half =
        as_vector_u16(latin_1_half_into_u16_byte_mask.lookup_32(v_latin, v_00));

    // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
    const vector_u16 v_u16_latin_2_half =
        as_vector_u16(latin_2_half_into_u16_byte_mask.lookup_32(v_latin, v_00));

    write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
    write_v_u16_11bits_to_utf8(v_u16_latin_2_half, utf8_output, v_0000, v_ff80);
    latin_input += 16;
  }

  if (end - latin_input >= 16) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    const auto v_latin = vector_u8::load(latin_input);

    if (v_latin.is_ascii()) { // ASCII fast path!!!!
      v_latin.store(utf8_output);
      latin_input += 16;
      utf8_output += 16;
    } else {
      // assuming a/b are bytes and A/B are uint16 of the same value
      // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
      const auto v_u16_latin_1_half = as_vector_u16(
          latin_1_half_into_u16_byte_mask.lookup_32(v_latin, v_00));

      write_v_u16_11bits_to_utf8(v_u16_latin_1_half, utf8_output, v_0000,
                                 v_ff80);
      latin_input += 8;
    }
  }

  return std::make_pair(latin_input, utf8_output);
}
/* end file src/ppc64/ppc64_convert_latin1_to_utf8.cpp */

/* begin file src/ppc64/ppc64_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
ppc64_convert_latin1_to_utf32(const char *buf, size_t len,
                              char32_t *utf32_output) {
  const size_t rounded_len = align_down<vector_u8::ELEMENTS>(len);

  for (size_t i = 0; i < rounded_len; i += vector_u8::ELEMENTS) {
    const auto in = vector_u8::load(&buf[i]);
    in.store_bytes_as_utf32(&utf32_output[i]);
  }

  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
}
/* end file src/ppc64/ppc64_convert_latin1_to_utf32.cpp */

/* begin file src/ppc64/ppc64_convert_utf8_to_latin1.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  const auto in = vector_u8::load(input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask &
      0xfff; // we are only processing 12 bytes in case it is not all ASCII
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    in.store(latin1_output);
    latin1_output += 12; // We wrote 12 characters.
    return 12;           // We consumed 12 bytes.
  }
  /// We do not have a fast path available, so we fallback.
  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we should have (idx < 64), if not, there is a bug in the validation or
  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes. On
  // processors where pdep/pext is fast, we might be able to use a small lookup
  // table.

  const auto reshuffle = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
  const auto perm8 = reshuffle.lookup_32(in, vector_u8::zero());
#if SIMDUTF_IS_BIG_ENDIAN
  const auto perm16 = as_vector_u16(perm8).swap_bytes();
#else
  const auto perm16 = as_vector_u16(perm8);
#endif // SIMDUTF_IS_BIG_ENDIAN
  const auto ascii = perm16 & uint16_t(0x7f);
  const auto highbyte = perm16 & uint16_t(0x1f00);
  const auto composed = ascii | highbyte.shr<2>();

  const auto latin1_packed = vector_u16::pack(composed, composed);
#if defined(__clang__)
  __attribute__((aligned(16))) char buf[16];
  latin1_packed.store(buf);
  memcpy(latin1_output, buf, 6);
#else
  // writing 8 bytes even though we only care about the first 6 bytes.
  const auto tmp = vec_u64_t(latin1_packed.value);
  memcpy(latin1_output, &tmp[0], 8);
#endif
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
/* end file src/ppc64/ppc64_convert_utf8_to_latin1.cpp */

/* begin file src/ppc64/ppc64_convert_utf8_to_utf32.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  const auto in = vector_u8::load(input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    in.store_bytes_as_utf32(utf32_output);
    utf32_output += 12; // We wrote 12 32-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
    // UTF-32 code units.
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm = as_vector_u16(in);
#else
    const auto perm = as_vector_u16(in).swap_bytes();
#endif // SIMDUTF_IS_BIG_ENDIAN
    // in = [110aaaaa|10bbbbbb]
    // t0 = [00000000|00bbbbbb]
    const auto t0 = perm & uint16_t(0x007f);

    // t1 = [00110aaa|aabbbbbb]
    const auto t1 = perm.shr<2>();
    const auto composed = select(uint16_t(0x1f00 >> 2), t1, t0);

    const auto composed8 = as_vector_u8(composed);
    composed8.store_words_as_utf32(utf32_output);

    utf32_output += 8; // We wrote 32 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units.
#if SIMDUTF_IS_BIG_ENDIAN
    const auto sh =
        vector_u8(-1, 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11);
#else
    const auto sh =
        vector_u8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));

    // in = [1110aaaa|10bbbbbb|10cccccc]

    // t0 = [00000000|00000000|00cccccc]
    const auto t0 = perm & uint32_t(0x0000007f);

    // t2 = [00000000|0000bbbb|bbcccccc]
    const auto t1 = perm.shr<2>();
    const auto t2 = select(uint32_t(0x00003f00 >> 2), t1, t0);

    // t4 = [00000000|aaaabbbb|bbcccccc]
    const auto t3 = perm.shr<4>();
    const auto t4 = select(uint32_t(0x0f0000 >> 4), t3, t2);

    t4.store(utf32_output);
    utf32_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fallback.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code-code units
    // this is a relatively easy scenario
    // we process SIX (6) input code-code units. The max length in bytes of six
    // code code units spanning between 1 and 2 bytes each is 12 bytes. On
    // processors where pdep/pext is fast, we might be able to use a small
    // lookup table.
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u16(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u16(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto ascii = perm & uint16_t(0x7f);
    const auto highbyte = perm & uint16_t(0x1f00);
    const auto composed = ascii | highbyte.shr<2>();

    as_vector_u8(composed).store_words_as_utf32(utf32_output);
    utf32_output += 6; // We wrote 12 bytes, 6 code points.
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto ascii = perm & uint32_t(0x7f);
    const auto middlebyte = perm & uint32_t(0x3f00);
    const auto middlebyte_shifted = middlebyte.shr<2>();
    const auto highbyte = perm & uint32_t(0x0f0000);
    const auto highbyte_shifted = highbyte.shr<4>();
    const auto composed = ascii | middlebyte_shifted | highbyte_shifted;

    composed.store(utf32_output);
    utf32_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code-code units
    const auto sh = vector_u8::load(&tables::utf8_to_utf16::shufutf8[idx]);
#if SIMDUTF_IS_BIG_ENDIAN
    const auto perm =
        as_vector_u32(sh.lookup_32(in, vector_u8::zero())).swap_bytes();
#else
    const auto perm = as_vector_u32(sh.lookup_32(in, vector_u8::zero()));
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto ascii = perm & uint32_t(0x0000007f);
    const auto middlebyte = perm & uint32_t(0x3f00);
    const auto middlebyte_shifted = middlebyte.shr<2>();
    auto middlehighbyte = perm & uint32_t(0x003f0000);
    // correct for spurious high bit
    const auto correct0 = perm & uint32_t(0x00400000);
    const auto correct = correct0.shr<1>();
    middlehighbyte = correct ^ middlehighbyte;
    const auto middlehighbyte_shifted = middlehighbyte.shr<4>();
    const auto highbyte = perm & uint32_t(0x07000000);
    const auto highbyte_shifted = highbyte.shr<6>();
    const auto composed =
        ascii | middlebyte_shifted | highbyte_shifted | middlehighbyte_shifted;
    composed.store(utf32_output);
    utf32_output += 3;
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
/* end file src/ppc64/ppc64_convert_utf8_to_utf32.cpp */

#if (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32) && SIMDUTF_FEATURE_UTF8
/* begin file src/ppc64/ppc64_convert_utf16_to_utf8.cpp */
/*
    The vectorized algorithm works on single SSE register i.e., it
    loads eight 16-bit code units.

    We consider three cases:
    1. an input register contains no surrogates and each value
       is in range 0x0000 .. 0x07ff.
    2. an input register contains no surrogates and values are
       is in range 0x0000 .. 0xffff.
    3. an input register contains surrogates --- i.e. codepoints
       can have 16 or 32 bits.

    Ad 1.

    When values are less than 0x0800, it means that a 16-bit code unit
    can be converted into: 1) single UTF8 byte (when it is an ASCII
    char) or 2) two UTF8 bytes.

    For this case we do only some shuffle to obtain these 2-byte
    codes and finally compress the whole SSE register with a single
    shuffle.

    We need 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    Ad 2.

    When values fit in 16-bit code units, but are above 0x07ff, then
    a single word may produce one, two or three UTF8 bytes.

    We prepare data for all these three cases in two registers.
    The first register contains lower two UTF8 bytes (used in all
    cases), while the second one contains just the third byte for
    the three-UTF8-bytes case.

    Finally these two registers are interleaved forming eight-element
    array of 32-bit values. The array spans two SSE registers.
    The bytes from the registers are compressed using two shuffles.

    We need 256-entry lookup table to get a compression pattern
    and the number of output bytes in the compressed vector register.
    Each entry occupies 17 bytes.

    To summarize:
    - We need two 256-entry tables that have 8704 bytes in total.
*/

// Auxiliary procedure used by UTF-16 and UTF-32 into UTF-8.
// Note the pointer is passed by reference, it is updated by the procedure.
template <typename T>
simdutf_really_inline void ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
    const vector_u16 in, uint16_t one_byte_bitmask,
    const T one_or_two_bytes_bytemask, uint16_t one_or_two_bytes_bitmask,
    char *&utf8_output) {
  // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
#if SIMDUTF_IS_BIG_ENDIAN
  const auto dup_lsb =
      vector_u8(1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
#else
  const auto dup_lsb =
      vector_u8(0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
#endif // SIMDUTF_IS_BIG_ENDIAN

  /* In this branch we handle three cases:
     1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
    single UFT-8 byte
     2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
    UTF-8 bytes
     3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
    three UTF-8 bytes

    We expand the input word (16-bit) into two code units (32-bit), thus
    we have room for four bytes. However, we need five distinct bit
    layouts. Note that the last byte in cases #2 and #3 is the same.

    We precompute byte 1 for case #1 and the common byte for cases #2 & #3
    in register t2.

    We precompute byte 1 for case #3 and -- **conditionally** -- precompute
    either byte 1 for case #2 or byte 2 for case #3. Note that they
    differ by exactly one bit.

    Finally from these two code units we build proper UTF-8 sequence, taking
    into account the case (i.e, the number of bytes to write).
  */
  /**
   * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
   * t2 => [0ccc|cccc] [10cc|cccc]
   * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
   */
  // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
  const auto t0 = as_vector_u16(dup_lsb.lookup_16(as_vector_u8(in)));

  // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
  const auto t1 = t0 & uint16_t(0b0011111101111111);
  // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
  const auto t2 = t1 | uint16_t(0b1000000000000000);

  // in = [aaaa|bbbb|bbcc|cccc]
  // a0 = [0000|0000|0000|aaaa]
  const auto a0 = in.shr<12>();
  // b0 = [aabb|bbbb|cccc|cc00]
  const auto b0 = in.shl<2>();
  // s0 = [00bb|bbbb|00cc|cccc]
  const auto s0 = select(uint16_t(0x3f00), b0, a0);

  // s3 = [11bb|bbbb|1110|aaaa]
  const auto s3 = s0 | uint16_t(0b1100000011100000);

  const auto m0 =
      ~as_vector_u16(one_or_two_bytes_bytemask) & uint16_t(0b0100000000000000);
  const auto s4 = s3 ^ m0;

  // 4. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
  const uint16_t mask =
      (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
  if (mask == 0) {
    // We only have three-byte code units. Use fast path.
#if SIMDUTF_IS_BIG_ENDIAN
    // Lookups produced by scripts/ppc64_convert_utf16_to_utf8.py
    const auto shuffle0 =
        vector_u8(1, 0, 16, 3, 2, 18, 5, 4, 20, 7, 6, 22, 9, 8, 24, 11);
    const auto shuffle1 = vector_u8(10, 26, 13, 12, 28, 15, 14, 30, -1, -1, -1,
                                    -1, -1, -1, -1, -1);
#else
    const auto shuffle0 =
        vector_u8(0, 1, 17, 2, 3, 19, 4, 5, 21, 6, 7, 23, 8, 9, 25, 10);
    const auto shuffle1 = vector_u8(11, 27, 12, 13, 29, 14, 15, 31, -1, -1, -1,
                                    -1, -1, -1, -1, -1);
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto utf8_0 = shuffle0.lookup_32(as_vector_u8(s4), as_vector_u8(t2));
    const auto utf8_1 = shuffle1.lookup_32(as_vector_u8(s4), as_vector_u8(t2));

    utf8_0.store(utf8_output);
    utf8_output += 16;
    utf8_1.store(utf8_output);
    utf8_output += 8;
    return;
  }

  const uint8_t mask0 = uint8_t(mask);

  const uint8_t *row0 =
      &simdutf::tables::ppc64_utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
  const auto shuffle0 = vector_u8::load(row0 + 1);

  const auto utf8_0 = shuffle0.lookup_32(as_vector_u8(s4), as_vector_u8(t2));
  const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

  const uint8_t *row1 =
      &simdutf::tables::ppc64_utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
  const auto shuffle1 = vector_u8::load(row1 + 1) + uint8_t(8);
  const auto utf8_1 = shuffle1.lookup_32(as_vector_u8(s4), as_vector_u8(t2));

  utf8_0.store(utf8_output);
  utf8_output += row0[0];
  utf8_1.store(utf8_output);
  utf8_output += row1[0];
}

struct utf16_to_utf8_t {
  error_code err;
  const char16_t *input;
  char *output;
};

/*
  Returns utf16_to_utf8_t value
  A scalar routine should carry on the conversion of the tail,
  iff there was no error.
*/
template <endianness big_endian>
utf16_to_utf8_t ppc64_convert_utf16_to_utf8(const char16_t *buf, size_t len,
                                            char *utf8_output) {

  const char16_t *end = buf + len;

  const auto v_f800 = vector_u16(0xf800);
  const auto v_d800 = vector_u16(0xd800);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    auto in = vector_u16::load(buf);
    if (not match_system(big_endian)) {
      in = in.swap_bytes();
    }
    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
    if (in.is_ascii()) {
      auto nextin = vector_u16::load(buf + vector_u16::ELEMENTS);
      if (not match_system(big_endian)) {
        nextin = nextin.swap_bytes();
      }

      if (nextin.is_ascii()) {
        // 1. pack the bytes
        const auto utf8_packed = vector_u16::pack(in, nextin);
        // 2. store (16 bytes)
        utf8_packed.store(utf8_output);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      // next block is not ASCII
      const auto utf8_packed = vector_u16::pack(in, in);
      // 2. store (16 bytes)
      utf8_packed.store(utf8_output);
      // 3. adjust pointers
      buf += 8;
      utf8_output += 8;
      in = nextin;
      // fallback
    }

    // no bits set above 7th bit
    const auto one_byte_bytemask = in < uint16_t(1 << 7);
    const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask();

    // no bits set above 11th bit
    const auto one_or_two_bytes_bytemask = in < uint16_t(1 << 11);
    const uint16_t one_or_two_bytes_bitmask =
        one_or_two_bytes_bytemask.to_bitmask();

    if (one_or_two_bytes_bitmask == 0xffff) {
      write_v_u16_11bits_to_utf8(
          in, utf8_output, as_vector_u8(one_byte_bytemask), one_byte_bitmask);
      buf += 8;
      continue;
    }

    // 1. Check if there are any surrogate word in the input chunk.
    //    We have also to deal with situation when there is a surrogate word
    //    at the end of a chunk.
    const auto surrogates_bytemask = (in & v_f800) == v_d800;

    // bitmask = 0x0000 if there are no surrogates
    //         = 0xc000 if the last word is a surrogate
    const uint16_t surrogates_bitmask = surrogates_bytemask.to_bitmask();
    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
    // However, it is likely an uncommon occurrence.
    if (surrogates_bitmask == 0x0000) {
      ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
          in, one_byte_bitmask, one_or_two_bytes_bytemask,
          one_or_two_bytes_bitmask, utf8_output);

      buf += 8;
      // surrogate pair(s) in a register
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint16_t word = scalar::utf16::swap_if_needed<big_endian>(buf[k]);
        if ((word & 0xFF80) == 0) {
          *utf8_output++ = uint8_t(word);
        } else if ((word & 0xF800) == 0) {
          *utf8_output++ = uint8_t((word >> 6) | 0b11000000);
          *utf8_output++ = uint8_t((word & 0b111111) | 0b10000000);
        } else if ((word & 0xF800) != 0xD800) {
          *utf8_output++ = uint8_t((word >> 12) | 0b11100000);
          *utf8_output++ = uint8_t(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = uint8_t((word & 0b111111) | 0b10000000);
        } else {
          // must be a surrogate pair
          uint16_t diff = uint16_t(word - 0xD800);
          uint16_t next_word =
              scalar::utf16::swap_if_needed<big_endian>(buf[k + 1]);
          k++;
          uint16_t diff2 = uint16_t(next_word - 0xDC00);
          if ((diff | diff2) > 0x3FF) {
            return utf16_to_utf8_t{error_code::SURROGATE, buf + k - 1,
                                   utf8_output};
          }
          uint32_t value = (diff << 10) + diff2 + 0x10000;
          *utf8_output++ = uint8_t((value >> 18) | 0b11110000);
          *utf8_output++ = uint8_t(((value >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = uint8_t(((value >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = uint8_t((value & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return utf16_to_utf8_t{error_code::SUCCESS, buf, utf8_output};
}
/* end file src/ppc64/ppc64_convert_utf16_to_utf8.cpp */
#endif // (SIMDUTF_FEATURE_UTF16 || SIMDUTF_FEATURE_UTF32) &&
       // SIMDUTF_FEATURE_UTF8

/* begin file src/ppc64/ppc64_convert_utf32_to_latin1.cpp */
enum class ErrorChecking { disabled, enabled };

struct utf32_to_latin1_t {
  error_code err;
  const char32_t *input;
  char *output;
};

template <ErrorChecking ec>
utf32_to_latin1_t simdutf_really_inline ppc64_convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) {
  constexpr size_t N = vector_u32::ELEMENTS;
  const size_t rounded_len = align_down<4 * N>(len);

  const auto high_bytes_mask = vector_u32::splat(0xFFFFFF00);

  for (size_t i = 0; i < rounded_len; i += 4 * N) {
    const auto in1 = vector_u32::load(buf + 0 * N);
    const auto in2 = vector_u32::load(buf + 1 * N);
    const auto in3 = vector_u32::load(buf + 2 * N);
    const auto in4 = vector_u32::load(buf + 3 * N);

    if (ec == ErrorChecking::enabled) {
      const auto combined = in1 | in2 | in3 | in4;
      const auto too_big = (combined & high_bytes_mask) != uint32_t(0);

      if (simdutf_unlikely(too_big.any())) {
        // Scalar code will carry on from the beginning of the current block
        // and report the exact error position.
        return utf32_to_latin1_t{error_code::OTHER, buf, latin1_output};
      }
    }

    // Note: element #1 contains 0, and is used to mask-out elements
#if SIMDUTF_IS_BIG_ENDIAN
    const auto shlo = vector_u8(0 + 3, 4 + 3, 8 + 3, 12 + 3, 16 + 3, 20 + 3,
                                24 + 3, 28 + 3, 1, 1, 1, 1, 1, 1, 1, 1);
    const auto shhi = vector_u8(1, 1, 1, 1, 1, 1, 1, 1, 0 + 3, 4 + 3, 8 + 3,
                                12 + 3, 16 + 3, 20 + 3, 24 + 3, 28 + 3);
#else
    const auto shlo =
        vector_u8(0, 4, 8, 12, 16, 20, 24, 28, 1, 1, 1, 1, 1, 1, 1, 1);
    const auto shhi =
        vector_u8(1, 1, 1, 1, 1, 1, 1, 1, 0, 4, 8, 12, 16, 20, 24, 28);
#endif // SIMDUTF_IS_BIG_ENDIAN
    const auto lo = shlo.lookup_32(as_vector_u8(in1), as_vector_u8(in2));
    const auto hi = shhi.lookup_32(as_vector_u8(in3), as_vector_u8(in4));

    const auto merged = lo | hi;

    merged.store(latin1_output);
    latin1_output += 4 * N;
    buf += 4 * N;
  }

  return utf32_to_latin1_t{error_code::SUCCESS, buf, latin1_output};
}
/* end file src/ppc64/ppc64_convert_utf32_to_latin1.cpp */

/* begin file src/ppc64/ppc64_convert_utf32_to_utf8.cpp */
struct utf32_to_utf8_t {
  error_code err;
  const char32_t *input;
  char *output;
};

template <ErrorReporting er>
utf32_to_utf8_t ppc64_convert_utf32_to_utf8(const char32_t *buf, size_t len,
                                            char *utf8_output) {
  const char32_t *end = buf + len;

  const auto v_f800 = vector_u16::splat(0xf800);
  const auto v_d800 = vector_u16::splat(0xd800);

  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto v_00000000 = vector_u32::zero();
  auto forbidden_bytemask = simd16<bool>();
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >=
         std::ptrdiff_t(
             16 + safety_margin)) { // buf is a char32_t pointer, each char32_t
                                    // has 4 bytes or 32 bits, thus buf + 16 *
                                    // char_32t = 512 bits = 64 bytes
    // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
    // These two values can hold only 8 UTF32 chars
    auto in0 = vector_u32::load(buf);
    auto in1 = vector_u32::load(buf + vector_u32::ELEMENTS);

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    auto in = vector_u32::pack(in0, in1);

    // Try to apply UTF-16 => UTF-8 from ./ppc64_convert_utf16_to_utf8.cpp

    // Check for ASCII fast path

    // ASCII fast path!!!!
    // We eagerly load another 32 bytes, hoping that they will be ASCII too.
    // The intuition is that we try to collect 16 ASCII characters which
    // requires a total of 64 bytes of input. If we fail, we just pass thirdin
    // and fourthin as our new inputs.
    if (in.is_ascii()) { // if the first two blocks are ASCII
      const auto in2 = vector_u32::load(buf + 2 * vector_u32::ELEMENTS);
      const auto in3 = vector_u32::load(buf + 3 * vector_u32::ELEMENTS);

      const auto next = vector_u32::pack(in2, in3);
      if (next.is_ascii()) {
        // 1. pack the bytes
        const auto utf8_packed = vector_u16::pack(in, next);
        // 2. store (16 bytes)
        utf8_packed.store(utf8_output);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      // `next` is not ASCII, write `in` and carry on with next

      // 1. pack the bytes
      const auto utf8_packed = vector_u16::pack(in, in);
      utf8_packed.store(utf8_output);
      // 3. adjust pointers
      buf += 8;
      utf8_output += 8;

      // Proceed with next input
      in = next;
      in0 = in2;
      in1 = in3;
    }

    // no bits set above 7th bit
    const auto one_byte_bytemask = in < uint16_t(1 << 7);
    const uint16_t one_byte_bitmask = one_byte_bytemask.to_bitmask();

    // no bits set above 11th bit
    const auto one_or_two_bytes_bytemask = in < uint16_t(1 << 11);
    const uint16_t one_or_two_bytes_bitmask =
        one_or_two_bytes_bytemask.to_bitmask();

    if (one_or_two_bytes_bitmask == 0xffff) {
      write_v_u16_11bits_to_utf8(
          in, utf8_output, as_vector_u8(one_byte_bytemask), one_byte_bitmask);
      buf += 8;
      continue;
    }

    // Check for overflow in packing
    const auto saturation_bytemask = ((in0 | in1) & v_ffff0000) == v_00000000;
    const uint16_t saturation_bitmask = saturation_bytemask.to_bitmask();
    if (saturation_bitmask == 0xffff) {
      switch (er) {
      case ErrorReporting::precise: {
        const auto forbidden = (in & v_f800) == v_d800;
        if (forbidden.any()) {
          // We return no error code, instead we force the scalar procedure
          // to rescan the portion of input where we've just found an error.
          return utf32_to_utf8_t{error_code::SUCCESS, buf, utf8_output};
        }
      } break;
      case ErrorReporting::at_the_end:
        forbidden_bytemask |= (in & v_f800) == v_d800;
        break;
      case ErrorReporting::none:
        break;
      }

      ppc64_convert_utf16_to_1_2_3_bytes_of_utf8(
          in, one_byte_bitmask, one_or_two_bytes_bytemask,
          one_or_two_bytes_bitmask, utf8_output);
      buf += 8;
    } else {
      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD in the
      // presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (er != ErrorReporting::none and
              (word >= 0xD800 && word <= 0xDFFF)) {
            return utf32_to_utf8_t{error_code::SURROGATE, buf + k, utf8_output};
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (er != ErrorReporting::none and (word > 0x10FFFF)) {
            return utf32_to_utf8_t{error_code::TOO_LARGE, buf + k, utf8_output};
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  if (er == ErrorReporting::at_the_end) {
    if (forbidden_bytemask.any()) {
      return utf32_to_utf8_t{error_code::SURROGATE, buf, utf8_output};
    }
  }

  return utf32_to_utf8_t{
      error_code::SUCCESS,
      buf,
      utf8_output,
  };
}
/* end file src/ppc64/ppc64_convert_utf32_to_utf8.cpp */

/* begin file src/ppc64/ppc64_utf8_length_from_latin1.cpp */
template <typename T> T min(T a, T b) { return a <= b ? a : b; }

std::pair<const char *, size_t> ppc64_utf8_length_from_latin1(const char *input,
                                                              size_t length) {
  constexpr size_t N = vector_u8::ELEMENTS;
  length = (length / N);

  size_t count = length * N;
  while (length != 0) {
    vector_u32 partial = vector_u32::zero();

    // partial accumulator has 32 bits => this yields (2^31 / 16)
    size_t chunk = min(length, size_t(0xffffffff / N));
    length -= chunk;
    while (chunk != 0) {
      auto local = vector_u8::zero();
      // local accumulator has 8 bits => this yields 255 max (we increment by 1
      // in each iteration)
      const size_t n = min(chunk, size_t(255));
      chunk -= n;
      for (size_t i = 0; i < n; i++) {
        const auto in = vector_i8::load(input);
        input += N;

        local -= as_vector_u8(in < vector_i8::splat(0));
      }

      partial = sum4bytes(local, partial);
    }

    for (int i = 0; i < vector_u32::ELEMENTS; i++) {
      count += size_t(partial.value[i]);
    }
  }

  return std::make_pair(input, count);
}
/* end file src/ppc64/ppc64_utf8_length_from_latin1.cpp */

/* begin file src/ppc64/ppc64_base64.cpp */
/*
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 *
 * AMD XOP specific: http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
 * Altivec has capabilities of AMD XOP (or vice versa): shuffle using 2 vectors
 * and variable shifts, thus this implementation shares some code solution
 * (modulo intrinsic function names).
 */

constexpr bool with_base64_std = false;
constexpr bool with_base64_url = true;
constexpr bool with_ignore_errors = true;
constexpr bool with_ignore_garbage = true;
constexpr bool with_strict_checking = false;

// --- encoding -----------------------------------------------

/*
    Procedure translates vector of bytes having 6-bit values
    into ASCII counterparts.
*/
template <bool base64_url>
vector_u8 encoding_translate_6bit_values(const vector_u8 input) {
  // credit: Wojciech Muła
  // reduce  0..51 -> 0
  //        52..61 -> 1 .. 10
  //            62 -> 11
  //            63 -> 12
  auto result = input.saturating_sub(vector_u8::splat(51));

  // distinguish between ranges 0..25 and 26..51:
  //         0 .. 25 -> remains 13
  //        26 .. 51 -> becomes 0
  const auto lt = input < vector_u8::splat(26);
  result = select(as_vector_u8(lt), vector_u8::splat(13), result);

  const auto shift_LUT =
      base64_url ? vector_u8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0)
                 : vector_u8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                             '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
  // read shift
  result = result.lookup_16(shift_LUT);

  return input + result;
}

/*
   Procedure expands 12 bytes (4*3 bytes) into 16 bytes,
   each byte stores 6 bits of data
*/
template <typename = void>
simdutf_really_inline vector_u8 encoding_expand_6bit_fields(vector_u8 input) {
#if SIMDUTF_IS_BIG_ENDIAN
  #define indices4(dx) (dx + 0), (dx + 1), (dx + 1), (dx + 2)
  const auto expand_3_to_4 = vector_u8(indices4(0 * 3), indices4(1 * 3),
                                       indices4(2 * 3), indices4(3 * 3));
  #undef indices4

  // input = [........|ccdddddd|bbbbcccc|aaaaaabb] as uint8_t
  //              3        2        1        0
  //
  // in'   = [aaaaaabb|bbbbcccc|bbbbcccc|ccdddddd] as uint32_t
  //              0        1        1        2
  const auto in = as_vector_u32(expand_3_to_4.lookup_16(input));

  // t0    = [00000000|00000000|00000000|00dddddd]
  const auto t0 = in & uint32_t(0x0000003f);

  // t1    = [00000000|00000000|00cccccc|00dddddd]
  const auto t1 = select(uint32_t(0x00003f00), in.shl<2>(), t0);

  // t2    = [00000000|00bbbbbb|00cccccc|00dddddd]
  const auto t2 = select(uint32_t(0x003f0000), in.shr<4>(), t1);

  // t3    = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
  const auto t3 = select(uint32_t(0x3f000000), in.shr<2>(), t2);

  return as_vector_u8(t3);
#else
  #define indices4(dx) (dx + 1), (dx + 0), (dx + 2), (dx + 1)
  const auto expand_3_to_4 = vector_u8(indices4(0 * 3), indices4(1 * 3),
                                       indices4(2 * 3), indices4(3 * 3));
  #undef indices4

  // input = [........|ccdddddd|bbbbcccc|aaaaaabb] as uint8_t
  //              3        2        1        0
  //
  // in'   = [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] as uint32_t
  //              1        2        0        1
  const auto in = as_vector_u32(expand_3_to_4.lookup_16(input));

  // t0    = [00dddddd|00000000|00000000|00000000]
  const auto t0 = in.shl<8>() & uint32_t(0x3f000000);

  // t1    = [00dddddd|00cccccc|00000000|00000000]
  const auto t1 = select(uint32_t(0x003f0000), in.shr<6>(), t0);

  // t2    = [00dddddd|00cccccc|00bbbbbb|00000000]
  const auto t2 = select(uint32_t(0x00003f00), in.shl<4>(), t1);

  // t3    = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
  const auto t3 = select(uint32_t(0x0000003f), in.shr<10>(), t2);

  return as_vector_u8(t3);
#endif // SIMDUTF_IS_BIG_ENDIAN
}

template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {

  const uint8_t *input = (const uint8_t *)src;

  uint8_t *out = (uint8_t *)dst;

  size_t i = 0;
  for (; i + 52 <= srclen; i += 48) {
    const auto in0 = vector_u8::load(input + i + 12 * 0);
    const auto in1 = vector_u8::load(input + i + 12 * 1);
    const auto in2 = vector_u8::load(input + i + 12 * 2);
    const auto in3 = vector_u8::load(input + i + 12 * 3);

    const auto expanded0 = encoding_expand_6bit_fields(in0);
    const auto expanded1 = encoding_expand_6bit_fields(in1);
    const auto expanded2 = encoding_expand_6bit_fields(in2);
    const auto expanded3 = encoding_expand_6bit_fields(in3);

    const auto base64_0 =
        encoding_translate_6bit_values<isbase64url>(expanded0);
    const auto base64_1 =
        encoding_translate_6bit_values<isbase64url>(expanded1);
    const auto base64_2 =
        encoding_translate_6bit_values<isbase64url>(expanded2);
    const auto base64_3 =
        encoding_translate_6bit_values<isbase64url>(expanded3);

    base64_0.store(out);
    out += 16;

    base64_1.store(out);
    out += 16;

    base64_2.store(out);
    out += 16;

    base64_3.store(out);
    out += 16;
  }
  for (; i + 16 <= srclen; i += 12) {
    const auto in = vector_u8::load(input + i);
    const auto expanded = encoding_expand_6bit_fields(in);
    const auto base64 = encoding_translate_6bit_values<isbase64url>(expanded);

    base64.store(out);
    out += 16;
  }

  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                        srclen - i, options);
}

// --- decoding -----------------------------------------------

static simdutf_really_inline void compress(const vector_u8 data, uint16_t mask,
                                           char *output) {
  if (mask == 0) {
    data.store(output);
    return;
  }

  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

#if SIMDUTF_IS_BIG_ENDIAN
  vec_u64_t tmp = {
      tables::base64::thintable_epi8[mask2],
      tables::base64::thintable_epi8[mask1],
  };

  auto shufmask = vector_u8(vec_reve(vec_u8_t(tmp)));

  // we increment by 0x08 the second half of the mask
  shufmask =
      shufmask + vector_u8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
#else
  vec_u64_t tmp = {
      tables::base64::thintable_epi8[mask1],
      tables::base64::thintable_epi8[mask2],
  };

  auto shufmask = vector_u8(vec_u8_t(tmp));

  // we increment by 0x08 the second half of the mask
  shufmask =
      shufmask + vector_u8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
#endif // SIMDUTF_IS_BIG_ENDIAN

  // this is the version "nearly pruned"
  const auto pruned = shufmask.lookup_16(data);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  const int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask, what it does is to write
  // only the first pop1 bytes from the first 8 bytes, and then
  // it fills in with the bytes from the second 8 bytes + some filling
  // at the end.
  const auto compactmask =
      vector_u8::load(tables::base64::pshufb_combine_table + pop1 * 8);

  const auto answer = compactmask.lookup_16(pruned);

  answer.store(output);
}

static simdutf_really_inline vector_u8 decoding_pack(vector_u8 input) {
#if SIMDUTF_IS_BIG_ENDIAN
  // in   = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
  // want = [00000000|aaaaaabb|bbbbcccc|ccdddddd]

  auto in = as_vector_u16(input);
  // t0   = [00??aaaa|aabbbbbb|00??cccc|ccdddddd]
  const auto t0 = in.shr<2>();
  const auto t1 = select(uint16_t(0x0fc0), t0, in);

  // t0   = [00??????|aaaaaabb|bbbbcccc|ccdddddd]
  const auto t2 = as_vector_u32(t1);
  const auto t3 = t2.shr<4>();
  const auto t4 = select(uint32_t(0x00fff000), t3, t2);

  const auto tmp = as_vector_u8(t4);

  const auto shuffle =
      vector_u8(1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 0, 0, 0, 0);

  const auto t = shuffle.lookup_16(tmp);

  return t;
#else
  // in   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
  // want = [00000000|aaaaaabb|bbbbcccc|ccdddddd]

  auto u = as_vector_u32(input).swap_bytes();

  auto in = vector_u16((vec_u16_t)u.value);
  // t0   = [00??aaaa|aabbbbbb|00??cccc|ccdddddd]
  const auto t0 = in.shr<2>();
  const auto t1 = select(uint16_t(0x0fc0), t0, in);

  // t0   = [00??????|aaaaaabb|bbbbcccc|ccdddddd]
  const auto t2 = as_vector_u32(t1);
  const auto t3 = t2.shr<4>();
  const auto t4 = select(uint32_t(0x00fff000), t3, t2);

  const auto tmp = as_vector_u8(t4);

  const auto shuffle =
      vector_u8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0, 0, 0, 0);

  const auto t = shuffle.lookup_16(tmp);

  return t;
#endif // SIMDUTF_IS_BIG_ENDIAN
}
static simdutf_really_inline void base64_decode(char *out, vector_u8 input) {
  const auto expanded = decoding_pack(input);
  expanded.store(out);
}

static simdutf_really_inline void base64_decode_block(char *out,
                                                      const char *src) {
  base64_decode(out + 12 * 0, vector_u8::load(src + 0 * 16));
  base64_decode(out + 12 * 1, vector_u8::load(src + 1 * 16));
  base64_decode(out + 12 * 2, vector_u8::load(src + 2 * 16));
  base64_decode(out + 12 * 3, vector_u8::load(src + 3 * 16));
}

static simdutf_really_inline void base64_decode_block_safe(char *out,
                                                           const char *src) {
  base64_decode(out + 12 * 0, vector_u8::load(src + 0 * 16));
  base64_decode(out + 12 * 1, vector_u8::load(src + 1 * 16));
  base64_decode(out + 12 * 2, vector_u8::load(src + 2 * 16));

  char buffer[16];
  base64_decode(buffer, vector_u8::load(src + 3 * 16));
  std::memcpy(out + 36, buffer, 12);
}

// ---base64 decoding::block64 class --------------------------

class block64 {
  simd8x64<uint8_t> b;

public:
  simdutf_really_inline block64(const char *src) : b(load_block(src)) {}
  simdutf_really_inline block64(const char16_t *src) : b(load_block(src)) {}

private:
  // The caller of this function is responsible to ensure that there are 64
  // bytes available from reading at src. The data is read into a block64
  // structure.
  static simdutf_really_inline simd8x64<uint8_t> load_block(const char *src) {
    const auto v0 = vector_u8::load(src + 16 * 0);
    const auto v1 = vector_u8::load(src + 16 * 1);
    const auto v2 = vector_u8::load(src + 16 * 2);
    const auto v3 = vector_u8::load(src + 16 * 3);

    return simd8x64<uint8_t>(v0, v1, v2, v3);
  }

  // The caller of this function is responsible to ensure that there are 128
  // bytes available from reading at src. The data is read into a block64
  // structure.
  static simdutf_really_inline simd8x64<uint8_t>
  load_block(const char16_t *src) {
    const auto m1 = vector_u16::load(src + 8 * 0);
    const auto m2 = vector_u16::load(src + 8 * 1);
    const auto m3 = vector_u16::load(src + 8 * 2);
    const auto m4 = vector_u16::load(src + 8 * 3);
    const auto m5 = vector_u16::load(src + 8 * 4);
    const auto m6 = vector_u16::load(src + 8 * 5);
    const auto m7 = vector_u16::load(src + 8 * 6);
    const auto m8 = vector_u16::load(src + 8 * 7);

    return simd8x64<uint8_t>(vector_u16::pack(m1, m2), vector_u16::pack(m3, m4),
                             vector_u16::pack(m5, m6),
                             vector_u16::pack(m7, m8));
  }

public:
  template <bool base64_url, bool ignore_garbage, bool default_or_url>
  static inline uint16_t to_base64_mask(vector_u8 &src, uint16_t &error) {
    const auto ascii_space_tbl =
        vector_u8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
                  0xc, 0xd, 0x0, 0x0);

    // credit: aqrit
    const auto delta_asso =
        default_or_url
            ? vector_u8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00,
                        0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16)
            : vector_u8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00,
                        0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);

    const auto delta_values =
        default_or_url
            ? vector_u8(0xBF, 0xE0, 0xB9, 0x13, 0x04, 0xBF, 0xBF, 0xB9, 0xB9,
                        0x00, 0xFF, 0x11, 0xFF, 0xBF, 0x10, 0xB9)
            : (base64_url
                   ? vector_u8(0x0, 0x0, 0x0, 0x13, 0x4, 0xBF, 0xBF, 0xB9, 0xB9,
                               0x0, 0x11, 0xC3, 0xBF, 0xE0, 0xB9, 0xB9)
                   : vector_u8(0x00, 0x00, 0x00, 0x13, 0x04, 0xBF, 0xBF, 0xB9,
                               0xB9, 0x00, 0x10, 0xC3, 0xBF, 0xBF, 0xB9, 0xB9));

    const auto check_asso =
        default_or_url
            ? vector_u8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                        0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06)
            : (base64_url
                   ? vector_u8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
                               0x3, 0x7, 0xB, 0xE, 0xB, 0x6)
                   : vector_u8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                               0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F));

    const auto check_values =
        default_or_url
            ? vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xD5, 0xA6, 0xB5,
                        0xA1, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80)
            : (base64_url
                   ? vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xB6, 0xA6,
                               0xB5, 0xA1, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80)
                   : vector_u8(0x80, 0x80, 0x80, 0x80, 0xCF, 0xBF, 0xD5, 0xA6,
                               0xB5, 0x86, 0xD1, 0x80, 0xB1, 0x80, 0x91, 0x80));

    const auto shifted = src.shr<3>();

    const auto delta_hash = avg(src.lookup_16(delta_asso), shifted);
    const auto check_hash = avg(src.lookup_16(check_asso), shifted);

    const auto out = as_vector_i8(delta_hash.lookup_16(delta_values))
                         .saturating_add(as_vector_i8(src));
    const auto chk = as_vector_i8(check_hash.lookup_16(check_values))
                         .saturating_add(as_vector_i8(src));

    const uint16_t mask = chk.to_bitmask();
    if (!ignore_garbage && mask) {
      const auto ascii = src.lookup_16(ascii_space_tbl);
      const auto ascii_space = (ascii == src);
      error = (mask ^ ascii_space.to_bitmask());
    }
    src = out;

    return mask;
  }

  template <bool base64_url, bool ignore_garbage, bool default_or_url>
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
    uint16_t err0 = 0;
    uint16_t err1 = 0;
    uint16_t err2 = 0;
    uint16_t err3 = 0;
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        b.chunks[0], err0);
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        b.chunks[1], err1);
    uint64_t m2 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        b.chunks[2], err2);
    uint64_t m3 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        b.chunks[3], err3);

    if (!ignore_garbage) {
      *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) |
               ((uint64_t)err3 << 48);
    }
    return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
  }

  simdutf_really_inline void copy_block(char *output) {
    b.store(reinterpret_cast<uint8_t *>(output));
  }

  simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
    uint64_t nmask = ~mask;
    compress(b.chunks[0], uint16_t(mask), output);
    compress(b.chunks[1], uint16_t(mask >> 16),
             output + count_ones(nmask & 0xFFFF));
    compress(b.chunks[2], uint16_t(mask >> 32),
             output + count_ones(nmask & 0xFFFFFFFF));
    compress(b.chunks[3], uint16_t(mask >> 48),
             output + count_ones(nmask & 0xFFFFFFFFFFFFULL));
    return count_ones(nmask);
  }

  simdutf_really_inline void base64_decode_block(char *out) {
    base64_decode(out + 12 * 0, b.chunks[0]);
    base64_decode(out + 12 * 1, b.chunks[1]);
    base64_decode(out + 12 * 2, b.chunks[2]);
    base64_decode(out + 12 * 3, b.chunks[3]);
  }

  simdutf_really_inline void base64_decode_block_safe(char *out) {
    base64_decode(out + 12 * 0, b.chunks[0]);
    base64_decode(out + 12 * 1, b.chunks[1]);
    base64_decode(out + 12 * 2, b.chunks[2]);
    char buffer[16];
    base64_decode(buffer, b.chunks[3]);
    std::memcpy(out + 12 * 3, buffer, 12);
  }
};
/* end file src/ppc64/ppc64_base64.cpp */

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace ppc64 {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0. In
   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        0b11110000u - 1,
                                        0b11100000u - 1,
                                        0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */

/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two-complement's, so largest possible continuation
      // byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */

/* begin file src/generic/utf8.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8.h */

/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace ppc64 {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // We don't count 0th bytes in the vectorized loops above, this
    // is why we need to count them in the end.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf32.h */
/* begin file src/generic/validate_utf32.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf32 {

simdutf_really_inline bool validate(const char32_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return true;
  }

  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff);
  const auto offset = vector_u32::splat(0xffff2000);
  const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
  auto currentmax = vector_u32::zero();
  auto currentoffsetmax = vector_u32::zero();

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if constexpr (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    currentmax = max(currentmax, in);
    currentoffsetmax = max(currentoffsetmax, in + offset);
    input += N;
  }

  const auto too_large = currentmax > standardmax;
  if (too_large.any()) {
    return false;
  }

  const auto surrogate = currentoffsetmax > standardoffsetmax;
  if (surrogate.any()) {
    return false;
  }

  return scalar::utf32::validate(input, end - input);
}

simdutf_really_inline result validate_with_errors(const char32_t *input,
                                                  size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return result(error_code::SUCCESS, 0);
  }

  const char32_t *start = input;
  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff + 1);
  const auto surrogate_mask = vector_u32::splat(0xfffff800);
  const auto surrogate_byte = vector_u32::splat(0x0000d800);

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if constexpr (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    const auto too_large = in >= standardmax;
    const auto surrogate = (in & surrogate_mask) == surrogate_byte;

    const auto combined = too_large | surrogate;
    if (simdutf_unlikely(combined.any())) {
      const size_t consumed = input - start;
      auto sr = scalar::utf32::validate_with_errors(input, end - input);
      sr.count += consumed;

      return sr;
    }

    input += N;
  }

  const size_t consumed = input - start;
  auto sr = scalar::utf32::validate_with_errors(input, end - input);
  sr.count += consumed;

  return sr;
}

} // namespace utf32
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/validate_utf32.h */

/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace ascii_validation {

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      return false;
    }
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  return in.is_ascii();
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/ascii_validation.h */

/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then we have that margin-1 is the eight last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 inputs bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace ppc64
} // namespace simdutf
  // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */

/* begin file src/generic/base64.h */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */
namespace simdutf {
namespace ppc64 {
namespace {
namespace base64 {

/*
    The following template function implements API for Base64 decoding.

    An implementation is responsible for providing the `block64` type and
    associated methods that perform actual conversion. Please refer
    to any vectorized implementation to learn the API of these procedures.
*/
template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t equalsigns = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0, true};
    }
    return {SUCCESS, full_input_length, 0};
  }
  char *end_of_safe_64byte_zone =
      dst == nullptr
          ? nullptr
          : ((srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63
                                        : dst);

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b(src);
      src += 64;
      uint64_t error = 0;
      const uint64_t badcharmask =
          b.to_base64_mask<base64_url, ignore_garbage, default_or_url>(&error);
      if (!ignore_garbage && error) {
        src -= 64;
        const size_t error_offset = trailing_zeroes(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        bufferptr += b.compress_block(badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        b.copy_block(bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          b.base64_decode_block_safe(dst);
        } else {
          b.base64_decode_block(dst);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if (!ignore_garbage &&
          (!scalar::base64::is_eight_byte(*src) || val > 64)) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r = scalar::base64::patch_tail_result(
        r, size_t(src - srcinit), size_t(dst - dstinit), equallocation,
        full_input_length, last_chunk_options);
    // When is_partial(last_chunk_options) is true, we must either end with
    // the end of the stream (beyond whitespace) or right after a non-ignorable
    // character or at the very beginning of the stream.
    // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
    if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
        r.input_count < full_input_length) {
      // First check if we can extend the input to the end of the stream
      while (r.input_count < full_input_length &&
             base64_ignorable(*(srcinit + r.input_count), options)) {
        r.input_count++;
      }
      // If we are still not at the end of the stream, then we must backtrack
      // to the last non-ignorable character.
      if (r.input_count < full_input_length) {
        while (r.input_count > 0 &&
               base64_ignorable(*(srcinit + r.input_count - 1), options)) {
          r.input_count--;
        }
      }
    }
    return r;
  }
  if (!ignore_garbage && equalsigns > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit),
              true};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}

} // namespace base64
} // unnamed namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/base64.h */
/* begin file src/generic/find.h */
namespace simdutf {
namespace ppc64 {
namespace {
namespace util {

simdutf_really_inline const char *find(const char *start, const char *end,
                                       char character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  // Align the start pointer to 64 bytes
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % 64;
  if (misalignment != 0) {
    size_t adjustment = 64 - misalignment;
    if (size_t(std::distance(start, end)) < adjustment) {
      adjustment = std::distance(start, end);
    }
    for (size_t i = 0; i < adjustment; i++) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for 64-byte aligned data
  for (; std::distance(start, end) >= 64; start += 64) {
    simd8x64<uint8_t> input(reinterpret_cast<const uint8_t *>(start));
    uint64_t matches = input.eq(uint8_t(character));
    if (matches != 0) {
      // Found a match, return the first one
      int index = trailing_zeroes(matches);
      return start + index;
    }
  }
  return std::find(start, end, character);
}

simdutf_really_inline const char16_t *
find(const char16_t *start, const char16_t *end, char16_t character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  // Align the start pointer to 64 bytes if misalignment is even
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % 64;
  if (misalignment != 0 && misalignment % 2 == 0) {
    size_t adjustment = (64 - misalignment) / sizeof(char16_t);
    if (size_t(std::distance(start, end)) < adjustment) {
      adjustment = std::distance(start, end);
    }
    for (size_t i = 0; i < adjustment; i++) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for 64-byte aligned data
  for (; std::distance(start, end) >= 32; start += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(start));
    uint64_t matches = input.eq(uint16_t(character));
    if (matches != 0) {
      // Found a match, return the first one
      int index = trailing_zeroes(matches) / 2;
      return start + index;
    }
  }
  return std::find(start, end, character);
}

} // namespace util
} // namespace
} // namespace ppc64
} // namespace simdutf
/* end file src/generic/find.h */

/* begin file src/ppc64/templates.cpp */
/*
    Template `convert_impl` implements generic conversion routine between
    different encodings. Procedure returns the number of written elements,
    or zero in the case of error.

    Parameters:
    * VectorizedConvert - vectorized procedure that returns structure having
      three fields: error_code (err), const Source* (input), Destination*
   (output)
    * ScalarConvert - scalar procedure that carries on conversion of tail
    * Source - type of input char (like char16_t, char)
    * Destination - type of input char
*/
template <typename VectorizedConvert, typename ScalarConvert, typename Source,
          typename Destination>
size_t convert_impl(VectorizedConvert vectorized_convert,
                    ScalarConvert scalar_convert, const Source *buf, size_t len,
                    Destination *output) {
  const auto vr = vectorized_convert(buf, len, output);
  const size_t consumed = vr.input - buf;
  const size_t written = vr.output - output;
  if (vr.err != simdutf::error_code::SUCCESS) {
    if (vr.err == simdutf::error_code::OTHER) {
      // Vectorized procedure detected an error, but does not know
      // exact position. The scalar procedure rescan the portion of
      // input and figure out where the error is located.
      return scalar_convert(vr.input, len - consumed, vr.output);
    }
    return 0;
  }

  if (consumed == len) {
    return written;
  }

  const auto ret = scalar_convert(vr.input, len - consumed, vr.output);
  if (ret == 0) {
    return 0;
  }

  return written + ret;
}

/*
    Template `convert_with_errors_impl` implements generic conversion routine
    between different encodings. Procedure returns a `result` instance ---
    please refer to its documentation for details.

    Parameters:
    * VectorizedConvert - vectorized procedure that returns structure having
      three fields: error_code (err), const Source* (input), Destination*
   (output)
    * ScalarConvert - scalar procedure that carries on conversion of tail
    * Source - type of input char (like char16_t, char)
    * Destination - type of input char
*/
template <typename VectorizedConvert, typename ScalarConvert, typename Source,
          typename Destination>
simdutf::result convert_with_errors_impl(VectorizedConvert vectorized_convert,
                                         ScalarConvert scalar_convert,
                                         const Source *buf, size_t len,
                                         Destination *output) {

  const auto vr = vectorized_convert(buf, len, output);
  const size_t consumed = vr.input - buf;
  const size_t written = vr.output - output;
  if (vr.err != simdutf::error_code::SUCCESS) {
    if (vr.err == simdutf::error_code::OTHER) {
      // Vectorized procedure detected an error, but does not know
      // exact position. The scalar procedure rescan the portion of
      // input and figure out where the error is located.
      auto sr = scalar_convert(vr.input, len - consumed, vr.output);
      sr.count += consumed;
      return sr;
    }
    return simdutf::result(vr.err, consumed);
  }

  if (consumed == len) {
    return simdutf::result(simdutf::error_code::SUCCESS, written);
  }

  simdutf::result sr = scalar_convert(vr.input, len - consumed, vr.output);
  if (sr.is_ok()) {
    sr.count += written;
  } else {
    sr.count += consumed;
  }

  return sr;
}
/* end file src/ppc64/templates.cpp */

#ifdef SIMDUTF_INTERNAL_TESTS
  #if SIMDUTF_FEATURE_BASE64
    #include "ppc64_base64_internal_tests.cpp"
  #endif // SIMDUTF_FEATURE_BASE64
#endif   // SIMDUTF_INTERNAL_TESTS
//
// Implementation-specific overrides
//
namespace simdutf {
namespace ppc64 {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return ppc64::utf8_validation::generic_validate_utf8(buf, len);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return ppc64::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return ppc64::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  return utf32::validate(buf, len);
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  return utf32::validate_with_errors(buf, len);
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  const auto ret = ppc64_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }

  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  const auto ret = ppc64_convert_latin1_to_utf32(buf, len, utf32_output);
  if (ret.first != buf + len) {
    const size_t processed = ret.first - buf;
    scalar::latin1_to_utf32::convert(ret.first, len - processed, ret.second);
  }

  return len;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return ppc64::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_latin1<ErrorChecking::enabled>,
                      scalar::utf32_to_latin1::convert, buf, len,
                      latin1_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_with_errors_impl(
      ppc64_convert_utf32_to_latin1<ErrorChecking::enabled>,
      scalar::utf32_to_latin1::convert_with_errors, buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_latin1<ErrorChecking::disabled>,
                      scalar::utf32_to_latin1::convert, buf, len,
                      latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_utf8<ErrorReporting::at_the_end>,
                      scalar::utf32_to_utf8::convert<const char32_t *, char *>,
                      buf, len, utf8_output);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_with_errors_impl(
      ppc64_convert_utf32_to_utf8<ErrorReporting::precise>,
      scalar::utf32_to_utf8::convert_with_errors<const char32_t *, char *>, buf,
      len, utf8_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_impl(ppc64_convert_utf32_to_utf8<ErrorReporting::none>,
                      scalar::utf32_to_utf8::convert<const char32_t *, char *>,
                      buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const auto ret = ppc64_utf8_length_from_latin1(input, length);
  const size_t consumed = ret.first - input;

  if (consumed == length) {
    return ret.second;
  }

  const auto scalar =
      scalar::latin1::utf8_length_from_latin1(ret.first, length - consumed);
  return scalar + ret.second;
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return scalar::base64::maximal_binary_length_from_base64(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64_impl<true>(output, input, length,
                                                       options, line_length);
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return util::find(start, end, character);
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return util::find(start, end, character);
}

#ifdef SIMDUTF_INTERNAL_TESTS
std::vector<implementation::TestProcedure>
implementation::internal_tests() const {
  #define entry(proc)                                                          \
    TestProcedure { #proc, proc }
  return {entry(base64_encoding_translate_6bit_values),
          entry(base64_encoding_expand_6bit_fields),
          entry(base64_decoding_valid),
          entry(base64_decoding_invalid_ignore_errors),
          entry(base64url_decoding_invalid_ignore_errors),
          entry(base64_decoding_invalid_strict_errors),
          entry(base64url_decoding_invalid_strict_errors),
          entry(base64_decoding_pack),
          entry(base64_compress)};
  #undef entry
}
#endif

} // namespace ppc64
} // namespace simdutf

/* begin file src/simdutf/ppc64/end.h */
/* end file src/simdutf/ppc64/end.h */
/* end file src/ppc64/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_RVV
/* begin file src/rvv/implementation.cpp */
/* begin file src/simdutf/rvv/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "rvv"
// #define SIMDUTF_IMPLEMENTATION rvv

#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_TARGET_RVV
#endif
/* end file src/simdutf/rvv/begin.h */
namespace simdutf {
namespace rvv {
namespace {
#ifndef SIMDUTF_RVV_H
  #error "rvv.h must be included"
#endif

} // unnamed namespace
} // namespace rvv
} // namespace simdutf

//
// Implementation-specific overrides
//
namespace simdutf {
namespace rvv {
/* begin file src/rvv/rvv_helpers.inl.cpp */
template <simdutf_ByteFlip bflip>
simdutf_really_inline static size_t
rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl,
                         vbool4_t m4even) {
  /* convert [000000000000aaaa|aaaaaabbbbbbbbbb]
   * to      [110111bbbbbbbbbb|110110aaaaaaaaaa] */
  vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl);
  sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl),
                             __riscv_vsrl_vx_u32m4(sur, 10, vl), vl);
  sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl);
  sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl);
  /* merge 1 byte utf32 and 2 byte sur */
  vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl);
  vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4(
      __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl));
  /* compress and store */
  vbool4_t mOut = __riscv_vmor_mm_b4(
      __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2);
  vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2);
  vl = __riscv_vcpop_m_b4(mOut, vl * 2);
  __riscv_vse16_v_u16m4(dst, simdutf_byteflip<bflip>(vout, vl), vl);
  return vl;
};
/* end file src/rvv/rvv_helpers.inl.cpp */

/* begin file src/rvv/rvv_length_from.inl.cpp */
simdutf_warn_unused size_t
implementation::count_utf8(const char *src, size_t len) const noexcept {
  return utf32_length_from_utf8(src, len);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *src, size_t len) const noexcept {
  return utf32_length_from_utf8(src, len);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *src, size_t len) const noexcept {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl);
    count += __riscv_vcpop_m_b1(mask, vl);
  }
  return count;
}

template <simdutf_ByteFlip bflip>
simdutf_really_inline static size_t
rvv_utf32_length_from_utf16(const char16_t *src, size_t len) {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    v = simdutf_byteflip<bflip>(v, vl);
    vbool2_t notHigh =
        __riscv_vmor_mm_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl),
                           __riscv_vmsltu_vx_u16m8_b2(v, 0xDC00, vl), vl);
    count += __riscv_vcpop_m_b2(notHigh, vl);
  }
  return count;
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
    const char16_t *src, size_t len) const noexcept {
  return rvv_utf32_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
    const char16_t *src, size_t len) const noexcept {
  if (supports_zvbb())
    return rvv_utf32_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len);
  else
    return rvv_utf32_length_from_utf16<simdutf_ByteFlip::V>(src, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *src, size_t len) const noexcept {
  size_t count = len;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    count += __riscv_vcpop_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl);
  }
  return count;
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *src, size_t len) const noexcept {
  size_t count = 0;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    vbool4_t m234 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7F, vl);
    vbool4_t m34 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7FF, vl);
    vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl);
    count += vl + __riscv_vcpop_m_b4(m234, vl) + __riscv_vcpop_m_b4(m34, vl) +
             __riscv_vcpop_m_b4(m4, vl);
  }
  return count;
}

/* end file src/rvv/rvv_length_from.inl.cpp */
/* begin file src/rvv/rvv_validate.inl.cpp */
simdutf_warn_unused bool
implementation::validate_ascii(const char *src, size_t len) const noexcept {
  size_t vlmax = __riscv_vsetvlmax_e8m8();
  vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax);
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl);
  }
  return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) <
         0;
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *src, size_t len) const noexcept {
  const char *beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
    long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl);
    if (idx >= 0)
      return result(error_code::TOO_LARGE, src - beg + idx);
  }
  return result(error_code::SUCCESS, src - beg);
}
/* Returns a close estimation of the number of valid UTF-8 bytes up to the
 * first invalid one, but never overestimating. */
simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src,
                                                         size_t len) {
  const char *beg = src;
  if (len < 32)
    return 0;

  /* validate first three bytes */
  {
    size_t idx = 3;
    while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10)
      ++idx;
    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
      return 0;
  }

  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
  static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};

  const vuint8m1_t err1tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
  const vuint8m1_t err2tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
  const vuint8m1_t err3tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));

  size_t tail = 3;
  size_t n = len - tail;

  for (size_t vl; n > 0; n -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m4(n);
    vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl);

    uint8_t next0 = src[vl + 0];
    uint8_t next1 = src[vl + 1];
    uint8_t next2 = src[vl + 2];

    /* fast path: ASCII */
    if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) <
            0 &&
        (next0 | next1 | next2) < 0b10000000)
      continue;

    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
     * https://arxiv.org/abs/2010.03090 */
    vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl);
    vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl);

    vuint8m4_t v2_hi_nibble = __riscv_vsrl_vx_u8m4(v2, 4, vl);
    vuint8m4_t v3_hi_nibble =
        __riscv_vslide1down_vx_u8m4(v2_hi_nibble, next2 >> 4, vl);

    vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl);
    vuint8m4_t idx1 = v2_hi_nibble;
    vuint8m4_t idx3 = v3_hi_nibble;

    vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1);
    vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2);
    vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3);
    vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4(
        __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl));

    vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl);
    vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl);
    vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl);
    vbool2_t err34 =
        __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl);
    vbool2_t errm =
        __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl);
    if (__riscv_vfirst_m_b2(errm, vl) >= 0)
      break;
  }

  /* we need to validate the last character */
  while (tail < len && (uint8_t(src[0]) >> 6) == 0b10)
    --src, ++tail;
  return src - beg;
}

simdutf_warn_unused bool
implementation::validate_utf8(const char *src, size_t len) const noexcept {
  size_t count = rvv_count_valid_utf8(src, len);
  return scalar::utf8::validate(src + count, len - count);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *src, size_t len) const noexcept {
  size_t count = rvv_count_valid_utf8(src, len);
  result res = scalar::utf8::validate_with_errors(src + count, len - count);
  return result(res.error, count + res.count);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *src, size_t len) const noexcept {
  size_t vlmax = __riscv_vsetvlmax_e32m8();
  vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax);
  vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax);
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
    max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl);
    maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl);
  }
  return __riscv_vfirst_m_b4(
             __riscv_vmor_mm_b4(
                 __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax),
                 __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax),
             vlmax) < 0;
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *src, size_t len) const noexcept {
  const char32_t *beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
    long idx1 =
        __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl);
    long idx2 = __riscv_vfirst_m_b4(
        __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl);
    if (idx1 >= 0 && idx2 >= 0) {
      if (idx1 <= idx2) {
        return result(error_code::TOO_LARGE, src - beg + idx1);
      } else {
        return result(error_code::SURROGATE, src - beg + idx2);
      }
    }
    if (idx1 >= 0) {
      return result(error_code::TOO_LARGE, src - beg + idx1);
    }
    if (idx2 >= 0) {
      return result(error_code::SURROGATE, src - beg + idx2);
    }
  }
  return result(error_code::SUCCESS, src - beg);
}
/* end file src/rvv/rvv_validate.inl.cpp */

/* begin file src/rvv/rvv_latin1_to.inl.cpp */
simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *src, size_t len, char *dst) const noexcept {
  char *beg = dst;
  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    vbool4_t nascii =
        __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl);
    size_t cnt = __riscv_vcpop_m_b4(nascii, vl);
    vlOut = vl + cnt;
    if (cnt == 0) {
      __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
      continue;
    }

    vuint8m2_t v0 =
        __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl);
    v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl);

    vuint8m4_t wide =
        __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4(
            __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl));
    vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2(
        __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2);
    vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2);

    __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut);
  }
  return dst - beg;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *src, size_t len, char32_t *dst) const noexcept {
  char32_t *beg = dst;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl);
  }
  return dst - beg;
}
/* end file src/rvv/rvv_latin1_to.inl.cpp */
/* begin file src/rvv/rvv_utf16_to.inl.cpp */
/* end file src/rvv/rvv_utf16_to.inl.cpp */

/* begin file src/rvv/rvv_utf32_to.inl.cpp */
simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf32_to_latin1_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *src, size_t len, char *dst) const noexcept {
  const char32_t *const beg = src;
  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
    vl = __riscv_vsetvl_e32m8(len);
    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
    long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl);
    if (idx >= 0)
      return result(error_code::TOO_LARGE, src - beg + idx);
    /* We don't use vcompress here, because its performance varies widely on
     * current platforms. This might be worth reconsidering once there is more
     * hardware available. */
    __riscv_vse8_v_u8m2(
        (uint8_t *)dst,
        __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl);
  }
  return result(error_code::SUCCESS, src - beg);
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *src, size_t len, char *dst) const noexcept {
  return convert_utf32_to_latin1(src, len, dst);
}

template <bool with_validation>
simdutf_warn_unused result convert_utf32_to_utf8_aux(const char32_t *src,
                                                     size_t len,
                                                     char *dst) noexcept {
  size_t n = len;
  const char32_t *srcBeg = src;
  const char *dstBeg = dst;
  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(
      __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);

  for (size_t vl, vlOut; n > 0;) {
    vl = __riscv_vsetvl_e32m4(n);

    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl);
    vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl);
    vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl);

    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
      vlOut = vl;
      __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut),
                          vlOut);
      n -= vl, src += vl, dst += vlOut;
      continue;
    }

    vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl);

    if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
      /* 0: [     aaa|aabbbbbb]
       * 1: [aabbbbbb|        ] vsll 8
       * 2: [        |   aaaaa] vsrl 6
       * 3: [00111111|00111111]
       * 4: [  bbbbbb|000aaaaa] (1|2)&3
       * 5: [10000000|11000000]
       * 6: [10bbbbbb|110aaaaa] 4|5 */
      vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
          __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl),
                               __riscv_vsrl_vx_u16m2(vn, 6, vl), vl),
          0b0011111100111111, vl);
      vuint16m2_t vout16 =
          __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl);
      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);

      /* Every high byte that is zero should be compressed
       * low bytes should never be compressed, so we set them
       * to all ones, and then create a non-zero bytes mask */
      vbool4_t mcomp =
          __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
                                       __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
                                   0, vl * 2);
      vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2);

      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
      __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);

      n -= vl, src += vl, dst += vlOut;
      continue;
    }

    if (with_validation) {
      const long idx1 =
          __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
      vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(
          __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
      const long idx2 = __riscv_vfirst_m_b8(sur, vl);
      if (idx1 >= 0 || idx2 >= 0) {
        if (static_cast<unsigned long>(idx1) <=
            static_cast<unsigned long>(idx2)) {
          return result(error_code::TOO_LARGE, src - srcBeg + idx1);
        } else {
          return result(error_code::SURROGATE, src - srcBeg + idx2);
        }
      }
    }

    vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl);
    long first = __riscv_vfirst_m_b8(m4, vl);
    size_t tail = vl - first;
    vl = first < 0 ? vl : first;

    if (vl > 0) { /* 1/2/3 byte utf8 */
      /* vn: [aaaabbbb|bbcccccc]
       * v1: [0bcccccc|        ] vsll  8
       * v1: [10cccccc|        ] vsll  8 & 0b00111111 | 0b10000000
       * v2: [        |110bbbbb] vsrl  6 & 0b00111111 | 0b11000000
       * v2: [        |10bbbbbb] vsrl  6 & 0b00111111 | 0b10000000
       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
       *  1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
       *  2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
       *  3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
       * [10cccccc]
       */
      vuint16m2_t v1, v2, v3, v12;
      v1 = __riscv_vor_vx_u16m2_mu(
          m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl);
      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);

      v2 = __riscv_vor_vx_u16m2(
          __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111,
                                vl),
          0b10000000, vl);
      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2,
                                   0b01000000, vl);
      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000,
                                vl);
      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);

      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl);
      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);

      vbool2_t mcomp = __riscv_vmor_mm_b2(
          m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
      vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);

      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
      __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);

      n -= vl, src += vl, dst += vlOut;
    }

    if (tail)
      while (n) {
        uint32_t word = src[0];
        if (word < 0x10000)
          break;
        if (word > 0x10FFFF)
          return result(error_code::TOO_LARGE, src - srcBeg);
        *dst++ = (uint8_t)((word >> 18) | 0b11110000);
        *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000);
        *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000);
        *dst++ = (uint8_t)((word & 0b111111) | 0b10000000);
        ++src;
        --n;
      }
  }

  return result(error_code::SUCCESS, dst - dstBeg);
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *src, size_t len, char *dst) const noexcept {
  constexpr bool with_validation = true;
  return convert_utf32_to_utf8_aux<with_validation>(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *src, size_t len, char *dst) const noexcept {
  result res = convert_utf32_to_utf8_with_errors(src, len, dst);
  return res.error == error_code::SUCCESS ? res.count : 0;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *src, size_t len, char *dst) const noexcept {
  constexpr bool with_validation = false;
  const auto res = convert_utf32_to_utf8_aux<with_validation>(src, len, dst);
  return res.count;
}

/* end file src/rvv/rvv_utf32_to.inl.cpp */
/* begin file src/rvv/rvv_utf8_to.inl.cpp */
template <typename Tdst, simdutf_ByteFlip bflip, bool validate = true>
simdutf_really_inline static size_t rvv_utf8_to_common(char const *src,
                                                       size_t len, Tdst *dst) {
  static_assert(std::is_same<Tdst, uint16_t>() ||
                    std::is_same<Tdst, uint32_t>(),
                "invalid type");
  constexpr bool is16 = std::is_same<Tdst, uint16_t>();
  constexpr endianness endian =
      bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG;
  const auto scalar = [](char const *in, size_t count, Tdst *out) {
    return is16 ? scalar::utf8_to_utf16::convert<endian>(in, count,
                                                         (char16_t *)out)
                : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out);
  };

  if (len < 32)
    return scalar(src, len, dst);

  /* validate first three bytes */
  if (validate) {
    size_t idx = 3;
    while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10)
      ++idx;
    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
      return 0;
  }

  size_t tail = 3;
  size_t n = len - tail;
  Tdst *beg = dst;

  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
  static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};

  const vuint8m1_t err1tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
  const vuint8m1_t err2tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
  const vuint8m1_t err3tbl =
      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));

  size_t vl8m1 = __riscv_vsetvlmax_e8m1();
  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);

  for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) {
    vl = __riscv_vsetvl_e8m2(n);

    vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl);
    uint64_t max = __riscv_vmv_x_s_u8m1_u8(
        __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl));

    uint8_t next0 = src[vl + 0];
    uint8_t next1 = src[vl + 1];
    uint8_t next2 = src[vl + 2];

    /* fast path: ASCII */
    if ((max | next0 | next1 | next2) < 0b10000000) {
      vlOut = vl;
      if (is16)
        __riscv_vse16_v_u16m4(
            (uint16_t *)dst,
            simdutf_byteflip<bflip>(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut),
            vlOut);
      else
        __riscv_vse32_v_u32m8((uint32_t *)dst,
                              __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut);
      continue;
    }

    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
     * https://arxiv.org/abs/2010.03090 */
    vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl);
    vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl);
    vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl);

    if (validate) {
      vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl);
      vuint8m2_t idx1 = __riscv_vsrl_vx_u8m2(v2, 4, vl);
      vuint8m2_t idx3 = __riscv_vsrl_vx_u8m2(v3, 4, vl);

      vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1);
      vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2);
      vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3);
      vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(
          __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl));

      vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl);
      vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl);
      vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl);
      vbool4_t err34 =
          __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl);
      vbool4_t errm =
          __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl);
      if (__riscv_vfirst_m_b4(errm, vl) >= 0)
        return 0;
    }

    /* decoding */

    /* mask of non continuation bytes */
    vbool4_t m =
        __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl);
    vlOut = __riscv_vcpop_m_b4(m, vl);

    /* extract first and second bytes */
    vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl);
    vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl);

    /* fast path: one and two byte */
    if (max < 0b11100000) {
      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);

      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
      b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);

      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
          b1,
          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
                                  vlOut),
          vlOut);
      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
      if (is16)
        __riscv_vse16_v_u16m4((uint16_t *)dst,
                              simdutf_byteflip<bflip>(b12, vlOut), vlOut);
      else
        __riscv_vse32_v_u32m8((uint32_t *)dst,
                              __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut);
      continue;
    }

    /* fast path: one, two and three byte */
    if (max < 0b11110000) {
      vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);

      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
      b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut);

      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
      vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut);

      vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
      b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut);

      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
          b1,
          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
                                  vlOut),
          vlOut);
      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
      vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(
          m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut);
      if (is16)
        __riscv_vse16_v_u16m4((uint16_t *)dst,
                              simdutf_byteflip<bflip>(b123, vlOut), vlOut);
      else
        __riscv_vse32_v_u32m8((uint32_t *)dst,
                              __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut);
      continue;
    }

    /* extract third and fourth bytes */
    vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
    vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);

    /* remove prefix from leading bytes
     *
     * We could also use vrgather here, but it increases register pressure,
     * and its performance varies widely on current platforms. It might be
     * worth reconsidering, though, once there is more hardware available.
     * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
     *
     * We shift left and then right by the number of bytes in the prefix,
     * which can be calculated as follows:
     *         x                                max(x-10, 0)
     * 0xxx -> 0000-0111 -> sift by 0 or 1   -> 0
     * 10xx -> 1000-1011 -> don't care
     * 110x -> 1100,1101 -> sift by 3        -> 2,3
     * 1110 -> 1110      -> sift by 4        -> 4
     * 1111 -> 1111      -> sift by 5        -> 5
     *
     * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
     * just need to manually detect and handle the one special case:
     */
  #define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx)                                   \
    vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx);                         \
    vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx);                         \
    vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx);                         \
    vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx);                         \
    /* remove prefix from trailing bytes */                                    \
    c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut);                          \
    c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut);                          \
    c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut);                          \
    vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut);                     \
    shift = __riscv_vmerge_vxm_u8m1(                                           \
        __riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3,                           \
        __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut);                    \
    c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut);                               \
    c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut);                               \
    /* unconditionally widen and combine to c1234 */                           \
    vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(                                 \
        __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut);                \
    vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(                                 \
        __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut);                \
    vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(                               \
        __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut);             \
    /* derive required right-shift amount from `shift` to reduce               \
     * c1234 to the required number of bytes */                                \
    c1234 = __riscv_vsrl_vv_u32m4(                                             \
        c1234,                                                                 \
        __riscv_vzext_vf4_u32m4(                                               \
            __riscv_vmul_vx_u8m1(                                              \
                __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut), \
                                      3, vlOut),                               \
                6, vlOut),                                                     \
            vlOut),                                                            \
        vlOut);                                                                \
    /* store result in desired format */                                       \
    if (is16)                                                                  \
      vlDst = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, c1234, vlOut,   \
                                              m4even);                         \
    else                                                                       \
      vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut);

    /* Unrolling this manually reduces register pressure and allows
     * us to terminate early. */
    {
      size_t vlOutm2 = vlOut, vlDst;
      vlOut = __riscv_vsetvl_e8m1(vlOut < vl8m1 ? vlOut : vl8m1);
      SIMDUTF_RVV_UTF8_TO_COMMON_M1(0)
      if (vlOutm2 == vlOut) {
        vlOut = vlDst;
        continue;
      }

      dst += vlDst;
      vlOut = vlOutm2 - vlOut;
    }
    {
      size_t vlDst;
      SIMDUTF_RVV_UTF8_TO_COMMON_M1(1)
      vlOut = vlDst;
    }

  #undef SIMDUTF_RVV_UTF8_TO_COMMON_M1
  }

  /* validate the last character and reparse it + tail */
  if (len > tail) {
    if ((uint8_t(src[0]) >> 6) == 0b10)
      --dst;
    while ((uint8_t(src[0]) >> 6) == 0b10 && tail < len)
      --src, ++tail;
    if (is16) {
      /* go back one more, when on high surrogate */
      if (simdutf_byteflip<bflip>((uint16_t)dst[-1]) >= 0xD800 &&
          simdutf_byteflip<bflip>((uint16_t)dst[-1]) <= 0xDBFF)
        --dst;
    }
  }
  size_t ret = scalar(src, tail, dst);
  if (ret == 0)
    return 0;
  return (size_t)(dst - beg) + ret;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *src, size_t len, char *dst) const noexcept {
  const char *beg = dst;
  uint8_t last = 0;
  for (size_t vl, vlOut; len > 0;
       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    // check which bytes are ASCII
    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
    // count ASCII bytes
    vlOut = __riscv_vcpop_m_b4(ascii, vl);
    // The original code would only enter the next block after this check:
    //   vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
    //   vlOut = __riscv_vcpop_m_b4(m, vl);
    //   if (vlOut != vl || last > 0b01111111) {...}q
    // So that everything is ASCII or continuation bytes, we just proceeded
    // without any processing, going straight to __riscv_vse8_v_u8m2.
    // But you need the __riscv_vslide1up_vx_u8m2 whenever there is a non-ASCII
    // byte.
    if (vlOut != vl) { // If not pure ASCII
      // Non-ASCII characters
      // We now want to mark the ascii and continuation bytes
      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
      // We count them, that's our new vlOut (output vector length)
      vlOut = __riscv_vcpop_m_b4(m, vl);

      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);

      vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl);
      vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4(
          __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl);
      // -62 i 0b11000010, so we check whether any of v0 is too big
      vbool4_t tobig = __riscv_vmand_mm_b4(
          leading0,
          __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl),
                                    1, vl),
          vl);
      if (__riscv_vfirst_m_b4(
              __riscv_vmor_mm_b4(
                  tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl),
              vl) >= 0)
        return 0;

      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
                                  v1, v1, 0b01000000, vl);
      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
    } else if (last >= 0b11000000) { // If last byte is a leading  byte and we
                                     // got only ASCII, error!
      return 0;
    }
    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
  }
  if (last > 0b10111111)
    return 0;
  return dst - beg;
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *src, size_t len, char *dst) const noexcept {
  size_t res = convert_utf8_to_latin1(src, len, dst);
  if (res)
    return result(error_code::SUCCESS, res);
  return scalar::utf8_to_latin1::convert_with_errors(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *src, size_t len, char *dst) const noexcept {
  const char *beg = dst;
  uint8_t last = 0;
  for (size_t vl, vlOut; len > 0;
       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
    vl = __riscv_vsetvl_e8m2(len);
    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
    vlOut = __riscv_vcpop_m_b4(ascii, vl);
    if (vlOut != vl) { // If not pure ASCII
      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
      vlOut = __riscv_vcpop_m_b4(m, vl);
      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);
      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
                                  v1, v1, 0b01000000, vl);
      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
    }
    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
  }
  return dst - beg;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *src, size_t len, char32_t *dst) const noexcept {
  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE>(src, len,
                                                              (uint32_t *)dst);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *src, size_t len, char32_t *dst) const noexcept {
  size_t res = convert_utf8_to_utf32(src, len, dst);
  if (res)
    return result(error_code::SUCCESS, res);
  return scalar::utf8_to_utf32::convert_with_errors(src, len, dst);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *src, size_t len, char32_t *dst) const noexcept {
  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE, false>(
      src, len, (uint32_t *)dst);
}
/* end file src/rvv/rvv_utf8_to.inl.cpp */

/* begin file src/rvv/rvv_find.cpp */
const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  const char *src = start;
  for (size_t len = end - start, vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e8m8(len);
    vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t *)src, vl);
    long idx =
        __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, character, vl), vl);
    if (idx >= 0)
      return src + idx;
  }
  return end;
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  const char16_t *src = start;
  for (size_t len = end - start, vl; len > 0; len -= vl, src += vl) {
    vl = __riscv_vsetvl_e16m8(len);
    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
    long idx =
        __riscv_vfirst_m_b2(__riscv_vmseq_vx_u16m8_b2(v, character, vl), vl);
    if (idx >= 0)
      return src + idx;
  }
  return end;
}
/* end file src/rvv/rvv_find.cpp */

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  return simdutf::scalar::base64::base64_to_binary_details_impl(
      input, length, output, options, last_chunk_options);
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64(output, input, length, options);
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64_impl<true>(output, input, length,
                                                       options, line_length);
}

} // namespace rvv
} // namespace simdutf

/* begin file src/simdutf/rvv/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_RVV
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

/* end file src/simdutf/rvv/end.h */
/* end file src/rvv/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_WESTMERE
/* begin file src/westmere/implementation.cpp */
/* begin file src/simdutf/westmere/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "westmere"
// #define SIMDUTF_IMPLEMENTATION westmere
#define SIMDUTF_SIMD_HAS_BYTEMASK 1

#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
// nothing needed.
#else
SIMDUTF_TARGET_WESTMERE
#endif
/* end file src/simdutf/westmere/begin.h */

namespace simdutf {
namespace westmere {
namespace {
#ifndef SIMDUTF_WESTMERE_H
  #error "westmere.h must be included"
#endif
using namespace simd;

simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.reduce_or().is_ascii();
}

simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<uint8_t> is_third_byte =
      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
  simd8<uint8_t> is_fourth_byte =
      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
  return simd8<bool>(is_third_byte | is_fourth_byte);
}

/* begin file src/westmere/internal/loader.cpp */
namespace internal {
namespace westmere {

/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
/*
 * reads a vector of uint16 values
 * bits after 11th are ignored
 * first 11 bits are encoded into utf8
 * !important! utf8_output must have at least 16 writable bytes
 */

inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
                                       const __m128i one_byte_bytemask,
                                       const uint16_t one_byte_bitmask) {
  // 0b1100_0000_1000_0000
  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
  // 0b0001_1111_0000_0000
  const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
  // 0b0000_0000_0011_1111
  const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);

  // 1. prepare 2-byte values
  // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
  // expected output   : [110a|aaaa|10bb|bbbb] x 8

  // t0 = [000a|aaaa|bbbb|bb00]
  const __m128i t0 = _mm_slli_epi16(v_u16, 2);
  // t1 = [000a|aaaa|0000|0000]
  const __m128i t1 = _mm_and_si128(t0, v_1f00);
  // t2 = [0000|0000|00bb|bbbb]
  const __m128i t2 = _mm_and_si128(v_u16, v_003f);
  // t3 = [000a|aaaa|00bb|bbbb]
  const __m128i t3 = _mm_or_si128(t1, t2);
  // t4 = [110a|aaaa|10bb|bbbb]
  const __m128i t4 = _mm_or_si128(t3, v_c080);

  // 2. merge ASCII and 2-byte codewords
  const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask);

  // 3. prepare bitmask for 8-bit lookup
  //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a
  //    - LSB)
  const uint16_t m0 = one_byte_bitmask & 0x5555;      // m0 = 0h0g0f0e0d0c0b0a
  const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
  const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
  // 4. pack the bytes
  const uint8_t *row =
      &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
  const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
  const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);

  // 5. store bytes
  _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);

  // 6. adjust pointers
  utf8_output += row[0];
}

inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
                                       const __m128i v_0000,
                                       const __m128i v_ff80) {
  // no bits set above 7th bit
  const __m128i one_byte_bytemask =
      _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000);
  const uint16_t one_byte_bitmask =
      static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));

  write_v_u16_11bits_to_utf8(v_u16, utf8_output, one_byte_bytemask,
                             one_byte_bitmask);
}
/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */

} // namespace westmere
} // namespace internal
/* end file src/westmere/internal/loader.cpp */

/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */
std::pair<const char *const, char *const>
sse_convert_latin1_to_utf8(const char *latin_input,
                           const size_t latin_input_length, char *utf8_output) {
  const char *end = latin_input + latin_input_length;

  const __m128i v_0000 = _mm_setzero_si128();
  // 0b1000_0000
  const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80);
  // 0b1111_1111_1000_0000
  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);

  const __m128i latin_1_half_into_u16_byte_mask =
      _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5,
                    '\x80', 6, '\x80', 7, '\x80');

  const __m128i latin_2_half_into_u16_byte_mask =
      _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80',
                    13, '\x80', 14, '\x80', 15, '\x80');

  // each latin1 takes 1-2 utf8 bytes
  // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then
  // adjust the pointer) so the last write can exceed the utf8_output size by
  // 8-1 bytes by reserving 8 extra input bytes, we expect the output to have
  // 8-16 bytes free
  while (end - latin_input >= 16 + 8) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);

    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
      latin_input += 16;
      utf8_output += 16;
      continue;
    }

    // assuming a/b are bytes and A/B are uint16 of the same value
    // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
    __m128i v_u16_latin_1_half =
        _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
    // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
    __m128i v_u16_latin_2_half =
        _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);

    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half,
                                                   utf8_output, v_0000, v_ff80);
    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half,
                                                   utf8_output, v_0000, v_ff80);
    latin_input += 16;
  }

  if (end - latin_input >= 16) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);

    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
      latin_input += 16;
      utf8_output += 16;
    } else {
      // assuming a/b are bytes and A/B are uint16 of the same value
      // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
      __m128i v_u16_latin_1_half =
          _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
      internal::westmere::write_v_u16_11bits_to_utf8(
          v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
      latin_input += 8;
    }
  }

  return std::make_pair(latin_input, utf8_output);
}
/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */

/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
sse_convert_latin1_to_utf32(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const char *end = buf + len;

  while (end - buf >= 16) {
    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
    __m128i in = _mm_loadu_si128((__m128i *)buf);

    // Shift input to process next 4 bytes
    __m128i in_shifted1 = _mm_srli_si128(in, 4);
    __m128i in_shifted2 = _mm_srli_si128(in, 8);
    __m128i in_shifted3 = _mm_srli_si128(in, 12);

    // expand 8-bit to 32-bit unit
    __m128i out1 = _mm_cvtepu8_epi32(in);
    __m128i out2 = _mm_cvtepu8_epi32(in_shifted1);
    __m128i out3 = _mm_cvtepu8_epi32(in_shifted2);
    __m128i out4 = _mm_cvtepu8_epi32(in_shifted3);

    _mm_storeu_si128((__m128i *)utf32_output, out1);
    _mm_storeu_si128((__m128i *)(utf32_output + 4), out2);
    _mm_storeu_si128((__m128i *)(utf32_output + 8), out3);
    _mm_storeu_si128((__m128i *)(utf32_output + 12), out4);

    utf32_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf32_output);
}
/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */

/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                     _mm_cvtepu8_epi32(in));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8),
                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12),
                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
    utf32_output += 12; // We wrote 12 32-bit characters.
    return 12;          // We consumed 12 bytes.
  }
  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                     _mm_cvtepu16_epi32(composed));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
    utf32_output += 8; // We wrote 32 bytes, 8 code points.
    return 16;
  }
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. There is probably a more efficient sequence, but the
    // following might do.
    const __m128i sh =
        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
    return 12;
  }
  /// We do not have a fast path available, so we fallback.

  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  if (idx < 64) {
    // SIX (6) input code-code units
    // this is a relatively easy scenario
    // we process SIX (6) input code-code units. The max length in bytes of six
    // code code units spanning between 1 and 2 bytes each is 12 bytes. On
    // processors where pdep/pext is fast, we might be able to use a small
    // lookup table.
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
                     _mm_cvtepu16_epi32(composed));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
    utf32_output += 6; // We wrote 12 bytes, 6 code points.
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii =
        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
    const __m128i middlebyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    const __m128i highbyte =
        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 4;
  } else if (idx < 209) {
    // TWO (2) input code-code units
    const __m128i sh =
        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
    const __m128i perm = _mm_shuffle_epi8(in, sh);
    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
    // correct for spurious high bit
    const __m128i correct =
        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
    const __m128i composed =
        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
    _mm_storeu_si128((__m128i *)utf32_output, composed);
    utf32_output += 3;
  } else {
    // here we know that there is an error but we do not handle errors
  }
  return consumed;
}
/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */

/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */
// depends on "tables/utf8_to_utf16_tables.h"

// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  const __m128i in = _mm_loadu_si128((__m128i *)input);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask &
      0xfff; // we are only processing 12 bytes in case it is not all ASCII
  if (utf8_end_of_code_point_mask == 0xfff) {
    // We process the data in chunks of 12 bytes.
    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
    latin1_output += 12; // We wrote 12 characters.
    return 12;           // We consumed 12 bytes.
  }
  /// We do not have a fast path available, so we fallback.
  const uint8_t idx =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed =
      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we should have (idx < 64), if not, there is a bug in the validation or
  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes. On
  // processors where pdep/pext is fast, we might be able to use a small lookup
  // table.
  const __m128i sh =
      _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
  const __m128i perm = _mm_shuffle_epi8(in, sh);
  const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
  const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
  __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
  const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
  // writing 8 bytes even though we only care about the first 6 bytes.
  // performance note: it would be faster to use _mm_storeu_si128, we should
  // investigate.
  _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */

/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
sse_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                            char *latin1_output) {
  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16

  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
  __m128i shufmask =
      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);

  for (size_t i = 0; i < rounded_len; i += 16) {
    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));

    __m128i check_combined = _mm_or_si128(in1, in2);
    check_combined = _mm_or_si128(check_combined, in3);
    check_combined = _mm_or_si128(check_combined, in4);

    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
      return std::make_pair(nullptr, latin1_output);
    }
    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
                                       _mm_shuffle_epi8(in2, shufmask));
    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
                                       _mm_shuffle_epi8(in4, shufmask));
    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
    _mm_storeu_si128((__m128i *)latin1_output, pack);
    latin1_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                        char *latin1_output) {
  const char32_t *start = buf;
  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16

  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
  __m128i shufmask =
      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);

  for (size_t i = 0; i < rounded_len; i += 16) {
    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));

    __m128i check_combined = _mm_or_si128(in1, in2);
    check_combined = _mm_or_si128(check_combined, in3);
    check_combined = _mm_or_si128(check_combined, in4);

    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
      // Fallback to scalar code for handling errors
      for (int k = 0; k < 16; k++) {
        char32_t codepoint = buf[k];
        if (codepoint <= 0xff) {
          *latin1_output++ = char(codepoint);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
      buf += 16;
      continue;
    }
    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
                                       _mm_shuffle_epi8(in2, shufmask));
    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
                                       _mm_shuffle_epi8(in4, shufmask));
    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
    _mm_storeu_si128((__m128i *)latin1_output, pack);
    latin1_output += 16;
    buf += 16;
  }

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */

/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
  const char32_t *end = buf + len;

  const __m128i v_0000 = _mm_setzero_si128();              //__m128 = 128 bits
  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000
                                                           // 0000
  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000
                                                           // 0000
  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000
                                                           // 0000
  const __m128i v_ffff0000 = _mm_set1_epi32(
      (uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
  const __m128i v_7fffffff = _mm_set1_epi32(
      (uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
  __m128i running_max = _mm_setzero_si128();
  __m128i forbidden_bytemask = _mm_setzero_si128();
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >=
         std::ptrdiff_t(
             16 + safety_margin)) { // buf is a char32_t pointer, each char32_t
                                    // has 4 bytes or 32 bits, thus buf + 16 *
                                    // char_32t = 512 bits = 64 bytes
    // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
    __m128i in = _mm_loadu_si128((__m128i *)buf);
    __m128i nextin = _mm_loadu_si128(
        (__m128i *)buf + 1); // These two values can hold only 8 UTF32 chars
    running_max = _mm_max_epu32(
        _mm_max_epu32(in, running_max), // take element-wise max char32_t from
                                        // in and running_max vector
        nextin); // and take element-wise max element from nextin and
                 // running_max vector

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m128i in_16 = _mm_packus_epi32(
        _mm_and_si128(in, v_7fffffff),
        _mm_and_si128(
            nextin,
            v_7fffffff)); // in this context pack the two __m128 into a single
    // By ensuring the highest bit is set to 0(&v_7fffffff), we are making sure
    // all values are interpreted as non-negative, or specifically, the values
    // are within the range of valid Unicode code points. remember : having
    // leading byte 0 means a positive number by the two complements system.
    // Unicode is well beneath the range where you'll start getting issues so
    // that's OK.

    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp

    // Check for ASCII fast path

    // ASCII fast path!!!!
    // We eagerly load another 32 bytes, hoping that they will be ASCII too.
    // The intuition is that we try to collect 16 ASCII characters which
    // requires a total of 64 bytes of input. If we fail, we just pass thirdin
    // and fourthin as our new inputs.
    if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII
      __m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2);
      __m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3);
      running_max = _mm_max_epu32(
          _mm_max_epu32(thirdin, running_max),
          fourthin); // take the running max of all 4 vectors thus far
      __m128i nextin_16 = _mm_packus_epi32(
          _mm_and_si128(thirdin, v_7fffffff),
          _mm_and_si128(fourthin,
                        v_7fffffff)); // pack into 1 vector, now you have two
      if (!_mm_testz_si128(
              nextin_16,
              v_ff80)) { // checks if the second packed vector is ASCII, if not:
        // 1. pack the bytes
        // obviously suboptimal.
        const __m128i utf8_packed = _mm_packus_epi16(
            in_16, in_16); // creates two copy of in_16 in 1 vector
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i *)utf8_output,
                         utf8_packed); // put them into the output
        // 3. adjust pointers
        buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32
                  // bits =  256 bits
        utf8_output +=
            8; // same with output, e.g. lift the first two blocks alone.
        // Proceed with next input
        in_16 = nextin_16;
        // We need to update in and nextin because they are used later.
        in = thirdin;
        nextin = fourthin;
      } else {
        // 1. pack the bytes
        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
        // 2. store (16 bytes)
        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }
    }

    // no bits set above 7th bit -- find out all the ASCII characters
    const __m128i one_byte_bytemask =
        _mm_cmpeq_epi16( // this takes four bytes at a time and compares:
            _mm_and_si128(in_16, v_ff80), // the vector that get only the first
                                          // 9 bits of each 16-bit/2-byte units
            v_0000                        //
        ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is
           // of format 0000 0000 0000 0XXX XXXX
    // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and
    // 0000 0000 0000 0000 if not for each 16-bit/2-byte units
    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(
        one_byte_bytemask)); // collect the MSB from previous vector and put
                             // them into uint16_t mas

    // no bits set above 11th bit
    const __m128i one_or_two_bytes_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
    const uint16_t one_or_two_bytes_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));

    if (one_or_two_bytes_bitmask == 0xffff) {
      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
      // produces 2 bytes)
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m128i v_1f00 =
          _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
      const __m128i v_003f =
          _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
      // t1 = [000a|aaaa|0000|0000]
      const __m128i t1 = _mm_and_si128(t0, v_1f00); // potential first utf8 byte
      // t2 = [0000|0000|00bb|bbbb]
      const __m128i t2 =
          _mm_and_si128(in_16, v_003f); // potential second utf8 byte
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m128i t3 =
          _mm_or_si128(t1, t2); // first and second potential utf8 byte together
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m128i t4 = _mm_or_si128(
          t3,
          v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit

      // 2. merge ASCII and 2-byte codewords
      const __m128i utf8_unpacked =
          _mm_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
      //    MSB, a - LSB)
      const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
      const uint16_t m1 =
          static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
      const uint8_t m2 =
          static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
      // 4. pack the bytes
      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);

      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);

      // 6. adjust pointers
      buf += 8;
      utf8_output += row[0];
      continue;
    }

    // Check for overflow in packing

    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
    if (saturation_bitmask == 0xffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
      forbidden_bytemask =
          _mm_or_si128(forbidden_bytemask,
                       _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));

      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
        single UFT-8 byte
          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
        two UTF-8 bytes
          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
        three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m128i s0 = _mm_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
                                          simdutf_vec(0b0100000000000000));
      const __m128i s4 = _mm_xor_si128(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint16_t mask =
          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
      if (mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
                                              15, 13, -1, -1, -1, -1);
        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
        utf8_output += 12;
        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }
      const uint8_t mask0 = uint8_t(mask);

      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
    } else {
      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD in the
      // presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr, utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
  if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(
          _mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
    return std::make_pair(nullptr, utf8_output);
  }

  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
    return std::make_pair(nullptr, utf8_output);
  }

  return std::make_pair(buf, utf8_output);
}

std::pair<result, char *>
sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                      char *utf8_output) {
  const char32_t *end = buf + len;
  const char32_t *start = buf;

  const __m128i v_0000 = _mm_setzero_si128();
  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
    // We load two 16 bytes registers for a total of 32 bytes or 8 characters.
    __m128i in = _mm_loadu_si128((__m128i *)buf);
    __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
    // Check for too large input
    __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
    if (static_cast<uint16_t>(_mm_movemask_epi8(
            _mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                            utf8_output);
    }

    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
    // saturation
    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff),
                                     _mm_and_si128(nextin, v_7fffffff));

    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp

    // Check for ASCII fast path
    if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
      // 1. pack the bytes
      // obviously suboptimal.
      const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
      // 2. store (16 bytes)
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
      // 3. adjust pointers
      buf += 8;
      utf8_output += 8;
      continue;
    }

    // no bits set above 7th bit
    const __m128i one_byte_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
    const uint16_t one_byte_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));

    // no bits set above 11th bit
    const __m128i one_or_two_bytes_bytemask =
        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
    const uint16_t one_or_two_bytes_bitmask =
        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));

    if (one_or_two_bytes_bitmask == 0xffff) {
      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
      // produces 2 bytes)
      // 1. prepare 2-byte values
      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
      // expected output   : [110a|aaaa|10bb|bbbb] x 8
      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);

      // t0 = [000a|aaaa|bbbb|bb00]
      const __m128i t0 = _mm_slli_epi16(in_16, 2);
      // t1 = [000a|aaaa|0000|0000]
      const __m128i t1 = _mm_and_si128(t0, v_1f00);
      // t2 = [0000|0000|00bb|bbbb]
      const __m128i t2 = _mm_and_si128(in_16, v_003f);
      // t3 = [000a|aaaa|00bb|bbbb]
      const __m128i t3 = _mm_or_si128(t1, t2);
      // t4 = [110a|aaaa|10bb|bbbb]
      const __m128i t4 = _mm_or_si128(t3, v_c080);

      // 2. merge ASCII and 2-byte codewords
      const __m128i utf8_unpacked =
          _mm_blendv_epi8(t4, in_16, one_byte_bytemask);

      // 3. prepare bitmask for 8-bit lookup
      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
      //    MSB, a - LSB)
      const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
      const uint16_t m1 =
          static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
      const uint8_t m2 =
          static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
      // 4. pack the bytes
      const uint8_t *row =
          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);

      // 5. store bytes
      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);

      // 6. adjust pointers
      buf += 8;
      utf8_output += row[0];
      continue;
    }

    // Check for overflow in packing
    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
    const uint32_t saturation_bitmask =
        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));

    if (saturation_bitmask == 0xffff) {
      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes

      // Check for illegal surrogate code units
      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
      const __m128i forbidden_bytemask =
          _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              utf8_output);
      }

      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);

      /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
        single UFT-8 byte
          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
        two UTF-8 bytes
          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
        three UTF-8 bytes

        We expand the input word (16-bit) into two code units (32-bit), thus
        we have room for four bytes. However, we need five distinct bit
        layouts. Note that the last byte in cases #2 and #3 is the same.

        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
        in register t2.

        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
        either byte 1 for case #2 or byte 2 for case #3. Note that they
        differ by exactly one bit.

        Finally from these two code units we build proper UTF-8 sequence, taking
        into account the case (i.e, the number of bytes to write).
      */
      /**
       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
       * t2 => [0ccc|cccc] [10cc|cccc]
       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
       */
#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));

      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
      const __m128i s0 = _mm_srli_epi16(in_16, 4);
      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
                                          simdutf_vec(0b0100000000000000));
      const __m128i s4 = _mm_xor_si128(s3, m0);
#undef simdutf_vec

      // 4. expand code units 16-bit => 32-bit
      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);

      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
      const uint16_t mask =
          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
      if (mask == 0) {
        // We only have three-byte code units. Use fast path.
        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
                                              15, 13, -1, -1, -1, -1);
        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
        utf8_output += 12;
        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
        utf8_output += 12;
        buf += 8;
        continue;
      }
      const uint8_t mask0 = uint8_t(mask);

      const uint8_t *row0 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);

      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);

      const uint8_t *row1 =
          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);

      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
      utf8_output += row0[0];
      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
      utf8_output += row1[0];

      buf += 8;
    } else {
      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem
      // wasteful to use scalar code, but being efficient with SIMD in the
      // presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
}
/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */

/* begin file src/westmere/sse_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

// --- encoding ----------------------------------------------------
template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
  // credit: Wojciech Muła
  // reduce  0..51 -> 0
  //        52..61 -> 1 .. 10
  //            62 -> 11
  //            63 -> 12
  __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51));

  // distinguish between ranges 0..25 and 26..51:
  //         0 .. 25 -> remains 0
  //        26 .. 51 -> becomes 13
  const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
  result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));

  __m128i shift_LUT;
  if (base64_url) {
    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                              '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
  } else {
    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
                              '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
  }

  // read shift
  result = _mm_shuffle_epi8(shift_LUT, result);

  return _mm_add_epi8(result, input);
}

inline __m128i insert_line_feed16(__m128i input, size_t K) {
  static const uint8_t shuffle_masks[16][16] = {
      {0x80, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 0x80, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 0x80, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 0x80, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 0x80, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 0x80, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 0x80, 6, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 0x80, 7, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 0x80, 8, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 0x80, 9, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x80, 10, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0x80, 11, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0x80, 12, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0x80, 13, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0x80, 14},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0x80}};
  // Prepare a vector with '\n' (0x0A)
  __m128i line_feed_vector = _mm_set1_epi8('\n');

  // Load the precomputed shuffle mask for K (index K-1)
  __m128i mask = _mm_loadu_si128((__m128i *)shuffle_masks[K]);
  __m128i lf_pos = _mm_cmpeq_epi8(mask, _mm_set1_epi8(static_cast<char>(0x80)));

  // Perform the shuffle to reposition the K bytes
  __m128i shuffled = _mm_shuffle_epi8(input, mask);

  // Blend with line_feed_vector to insert '\n' at the appropriate positions
  __m128i result = _mm_blendv_epi8(shuffled, line_feed_vector, lf_pos);

  return result;
}
template <bool isbase64url, bool use_lines>
size_t encode_base64_impl(char *dst, const char *src, size_t srclen,
                          base64_options options,
                          size_t line_length = simdutf::default_line_length) {
  size_t offset = 0;
  if (line_length < 4) {
    line_length = 4; // We do not support line_length less than 4
  }
  // credit: Wojciech Muła
  // SSE (lookup: pshufb improved unrolled)
  const uint8_t *input = (const uint8_t *)src;

  uint8_t *out = (uint8_t *)dst;
  const __m128i shuf =
      _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);

  size_t i = 0;
  for (; i + 52 <= srclen; i += 48) {
    __m128i in0 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
    __m128i in1 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
    __m128i in2 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
    __m128i in3 = _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));

    in0 = _mm_shuffle_epi8(in0, shuf);
    in1 = _mm_shuffle_epi8(in1, shuf);
    in2 = _mm_shuffle_epi8(in2, shuf);
    in3 = _mm_shuffle_epi8(in3, shuf);

    const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00));
    const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00));
    const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00));
    const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00));

    const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040));
    const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040));
    const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040));
    const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040));

    const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0));
    const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0));
    const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0));
    const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0));

    const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010));
    const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010));
    const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010));
    const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010));

    const __m128i input0 = _mm_or_si128(t1_0, t3_0);
    const __m128i input1 = _mm_or_si128(t1_1, t3_1);
    const __m128i input2 = _mm_or_si128(t1_2, t3_2);
    const __m128i input3 = _mm_or_si128(t1_3, t3_3);

    const __m128i t0 = lookup_pshufb_improved<isbase64url>(input0);
    const __m128i t1 = lookup_pshufb_improved<isbase64url>(input1);
    const __m128i t2 = lookup_pshufb_improved<isbase64url>(input2);
    const __m128i t3 = lookup_pshufb_improved<isbase64url>(input3);

    if (use_lines) {
      if (line_length >= 64) { // fast path
        if (offset + 64 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 64 - location_end;
          if (location_end < 16) {
            // We can store or extract store. See below.
            //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t0);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
                             insert_line_feed16(t0, location_end));
            out[16] = static_cast<uint8_t>(_mm_extract_epi8(t0, 15));
            out += 17;
          } else {
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0);
            out += 16;
          }
          if (location_end >= 16 && location_end < 32) {
            // We can store or extract store. See below.
            //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t1);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
                             insert_line_feed16(t1, location_end - 16));
            out[16] = static_cast<uint8_t>(_mm_extract_epi8(t1, 15));
            out += 17;
          } else {
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t1);
            out += 16;
          }
          if (location_end >= 32 && location_end < 48) {
            // We can store or extract store. See below.
            //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t2);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
                             insert_line_feed16(t2, location_end - 32));
            out[16] = static_cast<uint8_t>(_mm_extract_epi8(t2, 15));
            out += 17;
          } else {
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t2);
            out += 16;
          }
          if (location_end >= 48) {
            // We can store or extract store. See below.
            //_mm_storeu_si128(reinterpret_cast<__m128i *>(out+1), t3);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
                             insert_line_feed16(t3, location_end - 48));
            out[16] = static_cast<uint8_t>(_mm_extract_epi8(t3, 15));
            out += 17;
          } else {
            _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t3);
            out += 16;
          }
          offset = to_move;
        } else {

          _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0);
          _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), t1);
          _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), t2);
          _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), t3);
          offset += 64;
          out += 64;
        }
      } else { // slow path
        // could be optimized
        alignas(64) uint8_t buffer[64];
        _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), t0);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 16), t1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 32), t2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + 48), t3);
        std::memcpy(out, buffer, 64);
        size_t out_pos = 0;
        size_t local_offset = offset;
        for (size_t j = 0; j < 64;) {
          if (local_offset == line_length) {
            out[out_pos++] = '\n';
            local_offset = 0;
          }
          out[out_pos++] = buffer[j++];
          local_offset++;
        }
        offset = local_offset;
        out += out_pos;
      }
    } else {
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out), t0);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), t1);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), t2);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), t3);
      out += 64;
    }
  }
  for (; i + 16 <= srclen; i += 12) {

    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));

    // bytes from groups A, B and C are needed in separate 32-bit lanes
    // in = [DDDD|CCCC|BBBB|AAAA]
    //
    //      an input triplet has layout
    //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
    //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next
    //        triplet
    //
    //      shuffling changes the order of bytes: 1, 0, 2, 1
    //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
    //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
    //                  processed bits
    in = _mm_shuffle_epi8(in, shuf);

    // unpacking

    // t0    = [0000cccc|cc000000|aaaaaa00|00000000]
    const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
    // t1    = [00000000|00cccccc|00000000|00aaaaaa]
    //          (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned
    //          multiplication)
    const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));

    // t2    = [00000000|00dddddd|000000bb|bbbb0000]
    const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
    // t3    = [00dddddd|00000000|00bbbbbb|00000000](
    //          (d * (1 << 8), b * (1 << 4))
    const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));

    // res   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
    const __m128i indices = _mm_or_si128(t1, t3);

    const __m128i T0 = lookup_pshufb_improved<isbase64url>(indices);

    _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0);

    if (use_lines) {
      if (line_length >= 16) { // fast path
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0);
        if (offset + 16 > line_length) {
          size_t location_end = line_length - offset;
          size_t to_move = 16 - location_end;
          std::memmove(out + location_end + 1, out + location_end, to_move);
          out[location_end] = '\n';
          offset = to_move;
          out += 16 + 1;
        } else {
          offset += 16;
          out += 16;
        }
      } else { // slow path
        // could be optimized
        uint8_t buffer[16];
        _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer), T0);
        size_t out_pos = 0;
        size_t local_offset = offset;
        for (size_t j = 0; j < 16;) {
          if (local_offset == line_length) {
            out[out_pos++] = '\n';
            local_offset = 0;
          }
          out[out_pos++] = buffer[j++];
          local_offset++;
        }
        offset = local_offset;
        out += out_pos;
      }
    } else {
      _mm_storeu_si128(reinterpret_cast<__m128i *>(out), T0);
      out += 16;
    }
  }
  return ((char *)out - (char *)dst) +
         scalar::base64::tail_encode_base64_impl<use_lines>(
             (char *)out, src + i, srclen - i, options, line_length, offset);
}

template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  return encode_base64_impl<isbase64url, false>(dst, src, srclen, options);
}

// --- decoding -----------------------------------------------

static simdutf_really_inline void compress(__m128i data, uint16_t mask,
                                           char *output) {
  if (mask == 0) {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
    return;
  }

  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
                                    tables::base64::thintable_epi8[mask1]);
  // we increment by 0x08 the second half of the mask
  shufmask =
      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
  // this is the version "nearly pruned"
  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask, what it does is to write
  // only the first pop1 bytes from the first 8 bytes, and then
  // it fills in with the bytes from the second 8 bytes + some filling
  // at the end.
  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
      tables::base64::pshufb_combine_table + pop1 * 8));
  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
}

static simdutf_really_inline void base64_decode(char *out, __m128i str) {
  // credit: aqrit

  const __m128i pack_shuffle =
      _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);

  const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140));
  const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000));
  const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle);
  // Store the output:
  // this writes 16 bytes, but we only need 12.
  _mm_storeu_si128((__m128i *)out, t2);
}

// decode 64 bytes and output 48 bytes
static inline void base64_decode_block(char *out, const char *src) {
  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
  base64_decode(out + 12,
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
  base64_decode(out + 24,
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
  base64_decode(out + 36,
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
}

static inline void base64_decode_block_safe(char *out, const char *src) {
  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
  base64_decode(out + 12,
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
  base64_decode(out + 24,
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
  char buffer[16];
  base64_decode(buffer,
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
  std::memcpy(out + 36, buffer, 12);
}

// --- decoding - base64 class --------------------------------

class block64 {
  __m128i chunks[4];

public:
  // The caller of this function is responsible to ensure that there are 64
  // bytes available from reading at src.
  simdutf_really_inline block64(const char *src) {
    chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
    chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
    chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
    chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
  }

public:
  // The caller of this function is responsible to ensure that there are 128
  // bytes available from reading at src. The data is read into a block64
  // structure.
  simdutf_really_inline block64(const char16_t *src) {
    const auto m1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
    const auto m2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 8));
    const auto m3 =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
    const auto m4 =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 24));
    const auto m5 =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
    const auto m6 =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 40));
    const auto m7 =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
    const auto m8 =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 56));
    chunks[0] = _mm_packus_epi16(m1, m2);
    chunks[1] = _mm_packus_epi16(m3, m4);
    chunks[2] = _mm_packus_epi16(m5, m6);
    chunks[3] = _mm_packus_epi16(m7, m8);
  }

public:
  simdutf_really_inline void copy_block(char *output) {
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), chunks[0]);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), chunks[1]);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), chunks[2]);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), chunks[3]);
  }

public:
  simdutf_really_inline uint64_t compress_block(uint64_t mask, char *output) {
    if (is_power_of_two(mask)) {
      return compress_block_single(mask, output);
    }

    uint64_t nmask = ~mask;
    compress(chunks[0], uint16_t(mask), output);
    compress(chunks[1], uint16_t(mask >> 16),
             output + count_ones(nmask & 0xFFFF));
    compress(chunks[2], uint16_t(mask >> 32),
             output + count_ones(nmask & 0xFFFFFFFF));
    compress(chunks[3], uint16_t(mask >> 48),
             output + count_ones(nmask & 0xFFFFFFFFFFFFULL));
    return count_ones(nmask);
  }

private:
  simdutf_really_inline size_t compress_block_single(uint64_t mask,
                                                     char *output) {
    const size_t pos64 = trailing_zeroes(mask);
    const int8_t pos = pos64 & 0xf;
    switch (pos64 >> 4) {
    case 0b00: {
      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(chunks[0], sh);

      _mm_storeu_si128((__m128i *)(output + 0 * 16), compressed);
      _mm_storeu_si128((__m128i *)(output + 1 * 16 - 1), chunks[1]);
      _mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]);
      _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]);
    } break;
    case 0b01: {
      _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]);

      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(chunks[1], sh);

      _mm_storeu_si128((__m128i *)(output + 1 * 16), compressed);
      _mm_storeu_si128((__m128i *)(output + 2 * 16 - 1), chunks[2]);
      _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]);
    } break;
    case 0b10: {
      _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]);
      _mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]);

      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(chunks[2], sh);

      _mm_storeu_si128((__m128i *)(output + 2 * 16), compressed);
      _mm_storeu_si128((__m128i *)(output + 3 * 16 - 1), chunks[3]);
    } break;
    case 0b11: {
      _mm_storeu_si128((__m128i *)(output + 0 * 16), chunks[0]);
      _mm_storeu_si128((__m128i *)(output + 1 * 16), chunks[1]);
      _mm_storeu_si128((__m128i *)(output + 2 * 16), chunks[2]);

      const __m128i v0 = _mm_set1_epi8(char(pos - 1));
      const __m128i v1 =
          _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      const __m128i v2 = _mm_cmpgt_epi8(v1, v0);
      const __m128i sh = _mm_sub_epi8(v1, v2);
      const __m128i compressed = _mm_shuffle_epi8(chunks[3], sh);

      _mm_storeu_si128((__m128i *)(output + 3 * 16), compressed);
    } break;
    }

    return 63;
  }

public:
  template <bool base64_url, bool ignore_garbage, bool default_or_url>
  simdutf_really_inline uint64_t to_base64_mask(uint64_t *error) {
    uint32_t err0 = 0;
    uint32_t err1 = 0;
    uint32_t err2 = 0;
    uint32_t err3 = 0;
    uint64_t m0 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        &chunks[0], &err0);
    uint64_t m1 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        &chunks[1], &err1);
    uint64_t m2 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        &chunks[2], &err2);
    uint64_t m3 = to_base64_mask<base64_url, ignore_garbage, default_or_url>(
        &chunks[3], &err3);
    if (!ignore_garbage) {
      *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) |
               ((uint64_t)err3 << 48);
    }
    return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
  }

private:
  template <bool base64_url, bool ignore_garbage, bool default_or_url>
  simdutf_really_inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) {
    const __m128i ascii_space_tbl =
        _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
                      0x0, 0xc, 0xd, 0x0, 0x0);
    // credit: aqrit
    __m128i delta_asso;
    if (default_or_url) {
      delta_asso =
          _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00,
                        0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16);
    } else if (base64_url) {
      delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
                                 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
    } else {
      delta_asso =
          _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00,
                        0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
    }
    __m128i delta_values;
    if (default_or_url) {
      delta_values = _mm_setr_epi8(
          uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0x13),
          uint8_t(0x04), uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
          uint8_t(0xB9), uint8_t(0x00), uint8_t(0xFF), uint8_t(0x11),
          uint8_t(0xFF), uint8_t(0xBF), uint8_t(0x10), uint8_t(0xB9));

    } else if (base64_url) {
      delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
                                   uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
                                   0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
                                   uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
    } else {
      delta_values =
          _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
                        int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
                        int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
                        int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
    }
    __m128i check_asso;
    if (default_or_url) {
      check_asso =
          _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                        0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06);
    } else if (base64_url) {
      check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
                                 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
    } else {
      check_asso =
          _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                        0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
    }
    __m128i check_values;
    if (default_or_url) {
      check_values = _mm_setr_epi8(
          uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
          uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
          uint8_t(0xB5), uint8_t(0xA1), uint8_t(0x00), uint8_t(0x80),
          uint8_t(0x00), uint8_t(0x80), uint8_t(0x00), uint8_t(0x80));
    } else if (base64_url) {
      check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
                                   uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
                                   uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5),
                                   uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
                                   uint8_t(0x80), 0x0, uint8_t(0x80));
    } else {
      check_values =
          _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
                        int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
                        int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
                        int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
    }
    const __m128i shifted = _mm_srli_epi32(*src, 3);

    __m128i delta_hash =
        _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted);
    if (default_or_url) {
      delta_hash = _mm_and_si128(delta_hash, _mm_set1_epi8(0xf));
    }
    const __m128i check_hash =
        _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted);

    const __m128i out =
        _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src);
    const __m128i chk =
        _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src);
    const int mask = _mm_movemask_epi8(chk);
    if (!ignore_garbage && mask) {
      __m128i ascii_space =
          _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src);
      *error = (mask ^ _mm_movemask_epi8(ascii_space));
    }
    *src = out;
    return (uint16_t)mask;
  }

public:
  simdutf_really_inline void base64_decode_block(char *out) {
    base64_decode(out, chunks[0]);
    base64_decode(out + 12, chunks[1]);
    base64_decode(out + 24, chunks[2]);
    base64_decode(out + 36, chunks[3]);
  }

public:
  simdutf_really_inline void base64_decode_block_safe(char *out) {
    base64_decode(out, chunks[0]);
    base64_decode(out + 12, chunks[1]);
    base64_decode(out + 24, chunks[2]);
    char buffer[16];
    base64_decode(buffer, chunks[3]);
    std::memcpy(out + 36, buffer, 12);
  }
};
/* end file src/westmere/sse_base64.cpp */

} // unnamed namespace
} // namespace westmere
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace westmere {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0. In
   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        0b11110000u - 1,
                                        0b11100000u - 1,
                                        0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */
/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace westmere {
namespace {
namespace ascii_validation {

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      return false;
    }
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  return in.is_ascii();
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/ascii_validation.h */

/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two-complement's, so largest possible continuation
      // byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace westmere {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // We don't count 0th bytes in the vectorized loops above, this
    // is why we need to count them in the end.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf32.h */

/* begin file src/generic/utf8.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8.h */
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then we have that margin-1 is the eight last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 inputs bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace westmere
} // namespace simdutf
  // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */

/* begin file src/generic/validate_utf32.h */
namespace simdutf {
namespace westmere {
namespace {
namespace utf32 {

simdutf_really_inline bool validate(const char32_t *input, size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return true;
  }

  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff);
  const auto offset = vector_u32::splat(0xffff2000);
  const auto standardoffsetmax = vector_u32::splat(0xfffff7ff);
  auto currentmax = vector_u32::zero();
  auto currentoffsetmax = vector_u32::zero();

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if constexpr (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    currentmax = max(currentmax, in);
    currentoffsetmax = max(currentoffsetmax, in + offset);
    input += N;
  }

  const auto too_large = currentmax > standardmax;
  if (too_large.any()) {
    return false;
  }

  const auto surrogate = currentoffsetmax > standardoffsetmax;
  if (surrogate.any()) {
    return false;
  }

  return scalar::utf32::validate(input, end - input);
}

simdutf_really_inline result validate_with_errors(const char32_t *input,
                                                  size_t size) {
  if (simdutf_unlikely(size == 0)) {
    // empty input is valid UTF-32. protect the implementation from
    // handling nullptr
    return result(error_code::SUCCESS, 0);
  }

  const char32_t *start = input;
  const char32_t *end = input + size;

  using vector_u32 = simd32<uint32_t>;

  const auto standardmax = vector_u32::splat(0x10ffff + 1);
  const auto surrogate_mask = vector_u32::splat(0xfffff800);
  const auto surrogate_byte = vector_u32::splat(0x0000d800);

  constexpr size_t N = vector_u32::ELEMENTS;

  while (input + N < end) {
    auto in = vector_u32(input);
    if constexpr (!match_system(endianness::BIG)) {
      in.swap_bytes();
    }

    const auto too_large = in >= standardmax;
    const auto surrogate = (in & surrogate_mask) == surrogate_byte;

    const auto combined = too_large | surrogate;
    if (simdutf_unlikely(combined.any())) {
      const size_t consumed = input - start;
      auto sr = scalar::utf32::validate_with_errors(input, end - input);
      sr.count += consumed;

      return sr;
    }

    input += N;
  }

  const size_t consumed = input - start;
  auto sr = scalar::utf32::validate_with_errors(input, end - input);
  sr.count += consumed;

  return sr;
}

} // namespace utf32
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/validate_utf32.h */

/* begin file src/generic/base64.h */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */
namespace simdutf {
namespace westmere {
namespace {
namespace base64 {

/*
    The following template function implements API for Base64 decoding.

    An implementation is responsible for providing the `block64` type and
    associated methods that perform actual conversion. Please refer
    to any vectorized implementation to learn the API of these procedures.
*/
template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t equalsigns = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0, true};
    }
    return {SUCCESS, full_input_length, 0};
  }
  char *end_of_safe_64byte_zone =
      dst == nullptr
          ? nullptr
          : ((srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63
                                        : dst);

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b(src);
      src += 64;
      uint64_t error = 0;
      const uint64_t badcharmask =
          b.to_base64_mask<base64_url, ignore_garbage, default_or_url>(&error);
      if (!ignore_garbage && error) {
        src -= 64;
        const size_t error_offset = trailing_zeroes(error);
        return {error_code::INVALID_BASE64_CHARACTER,
                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        bufferptr += b.compress_block(badcharmask, bufferptr);
      } else if (bufferptr != buffer) {
        b.copy_block(bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          b.base64_decode_block_safe(dst);
        } else {
          b.base64_decode_block(dst);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if (!ignore_garbage &&
          (!scalar::base64::is_eight_byte(*src) || val > 64)) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
#if !SIMDUTF_IS_BIG_ENDIAN
      triple = scalar::u32_swap_bytes(triple);
#endif
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r = scalar::base64::patch_tail_result(
        r, size_t(src - srcinit), size_t(dst - dstinit), equallocation,
        full_input_length, last_chunk_options);
    // When is_partial(last_chunk_options) is true, we must either end with
    // the end of the stream (beyond whitespace) or right after a non-ignorable
    // character or at the very beginning of the stream.
    // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
    if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
        r.input_count < full_input_length) {
      // First check if we can extend the input to the end of the stream
      while (r.input_count < full_input_length &&
             base64_ignorable(*(srcinit + r.input_count), options)) {
        r.input_count++;
      }
      // If we are still not at the end of the stream, then we must backtrack
      // to the last non-ignorable character.
      if (r.input_count < full_input_length) {
        while (r.input_count > 0 &&
               base64_ignorable(*(srcinit + r.input_count - 1), options)) {
          r.input_count--;
        }
      }
    }
    return r;
  }
  if (!ignore_garbage && equalsigns > 0) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit),
              true};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}

} // namespace base64
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/base64.h */
/* begin file src/generic/find.h */
namespace simdutf {
namespace westmere {
namespace {
namespace util {

simdutf_really_inline const char *find(const char *start, const char *end,
                                       char character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  // Align the start pointer to 64 bytes
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % 64;
  if (misalignment != 0) {
    size_t adjustment = 64 - misalignment;
    if (size_t(std::distance(start, end)) < adjustment) {
      adjustment = std::distance(start, end);
    }
    for (size_t i = 0; i < adjustment; i++) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for 64-byte aligned data
  for (; std::distance(start, end) >= 64; start += 64) {
    simd8x64<uint8_t> input(reinterpret_cast<const uint8_t *>(start));
    uint64_t matches = input.eq(uint8_t(character));
    if (matches != 0) {
      // Found a match, return the first one
      int index = trailing_zeroes(matches);
      return start + index;
    }
  }
  return std::find(start, end, character);
}

simdutf_really_inline const char16_t *
find(const char16_t *start, const char16_t *end, char16_t character) noexcept {
  // Handle empty or invalid range
  if (start >= end)
    return end;
  // Align the start pointer to 64 bytes if misalignment is even
  uintptr_t misalignment = reinterpret_cast<uintptr_t>(start) % 64;
  if (misalignment != 0 && misalignment % 2 == 0) {
    size_t adjustment = (64 - misalignment) / sizeof(char16_t);
    if (size_t(std::distance(start, end)) < adjustment) {
      adjustment = std::distance(start, end);
    }
    for (size_t i = 0; i < adjustment; i++) {
      if (start[i] == character) {
        return start + i;
      }
    }
    start += adjustment;
  }

  // Main loop for 64-byte aligned data
  for (; std::distance(start, end) >= 32; start += 32) {
    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(start));
    uint64_t matches = input.eq(uint16_t(character));
    if (matches != 0) {
      // Found a match, return the first one
      int index = trailing_zeroes(matches) / 2;
      return start + index;
    }
  }
  return std::find(start, end, character);
}

} // namespace util
} // namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/find.h */
/* begin file src/generic/base64lengths.h */
namespace simdutf {
namespace westmere {
namespace {
namespace base64_lengths {

simdutf_warn_unused size_t binary_length_from_base64(const char *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= length; pos += 64) {
    simd8x64<uint8_t> block(reinterpret_cast<const uint8_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

simdutf_warn_unused size_t binary_length_from_base64(const char16_t *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 32 <= length; pos += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char16_t c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

} // namespace base64_lengths
} // unnamed namespace
} // namespace westmere
} // namespace simdutf
/* end file src/generic/base64lengths.h */

//
// Implementation-specific overrides
//

namespace simdutf {
namespace westmere {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return westmere::utf8_validation::generic_validate_utf8(buf, len);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return westmere::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return westmere::ascii_validation::generic_validate_ascii_with_errors(buf,
                                                                        len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  return utf32::validate(buf, len);
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  return utf32::validate_with_errors(buf, len);
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {

  std::pair<const char *, char *> ret =
      sse_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }

  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      sse_convert_latin1_to_utf32(buf, len, utf32_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_converted_chars == 0) {
      return 0;
    }
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      sse_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;
  // if (ret.first != buf + len) {
  if (ret.first < buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      westmere::sse_convert_utf32_to_latin1_with_errors(buf, len,
                                                        latin1_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  // optimization opportunity: we could provide an optimized function.
  return convert_utf32_to_latin1(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      sse_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  return convert_utf32_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points_bytemask(input, length);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t len) const noexcept {
  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
  size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
  size_t i = 0;
  if (answer >= 2048) { // long strings optimization
    __m128i two_64bits = _mm_setzero_si128();
    while (i + sizeof(__m128i) <= len) {
      __m128i runner = _mm_setzero_si128();
      size_t iterations = (len - i) / sizeof(__m128i);
      if (iterations > 255) {
        iterations = 255;
      }
      size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
      for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) {
        __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
        __m128i input2 =
            _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
        __m128i input3 =
            _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i)));
        __m128i input4 =
            _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i)));
        __m128i input12 =
            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1),
                         _mm_cmpgt_epi8(_mm_setzero_si128(), input2));
        __m128i input34 =
            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3),
                         _mm_cmpgt_epi8(_mm_setzero_si128(), input4));
        __m128i input1234 = _mm_add_epi8(input12, input34);
        runner = _mm_sub_epi8(runner, input1234);
      }
      for (; i <= max_i; i += sizeof(__m128i)) {
        __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
        runner = _mm_sub_epi8(runner,
                              _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
      }
      two_64bits =
          _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
    }
    answer +=
        _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1);
  } else if (answer > 0) { // short string optimization
    for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) {
      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
      answer += count_ones(non_ascii);
      latin = _mm_loadu_si128((const __m128i *)(input + i) + 1);
      non_ascii = (uint16_t)_mm_movemask_epi8(latin);
      answer += count_ones(non_ascii);
    }
    for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) {
      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
      answer += count_ones(non_ascii);
    }
  }
  return answer + scalar::latin1::utf8_length_from_latin1(
                      reinterpret_cast<const char *>(str + i), len - i);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return base64::compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return base64::compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return base64::compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return base64::compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64_impl<true, true>(output, input, length, options,
                                          line_length);

  } else {
    return encode_base64_impl<false, true>(output, input, length, options,
                                           line_length);
  }
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return util::find(start, end, character);
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return util::find(start, end, character);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

} // namespace westmere
} // namespace simdutf

/* begin file src/simdutf/westmere/end.h */
#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif

#undef SIMDUTF_SIMD_HAS_BYTEMASK
/* end file src/simdutf/westmere/end.h */
/* end file src/westmere/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_LASX
/* begin file src/lasx/implementation.cpp */
/* begin file src/simdutf/lasx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lasx"
// #define SIMDUTF_IMPLEMENTATION lasx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1

#if SIMDUTF_CAN_ALWAYS_RUN_LASX
// nothing needed.
#else
SIMDUTF_TARGET_LASX
#endif
/* end file src/simdutf/lasx/begin.h */
namespace simdutf {
namespace lasx {
namespace {
#ifndef SIMDUTF_LASX_H
  #error "lasx.h must be included"
#endif
using namespace simd;

// convert vmskltz/vmskgez/vmsknz to
// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index
const uint8_t lasx_1_2_utf8_bytes_mask[] = {
    0,   1,   4,   5,   16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,
    85,  2,   3,   6,   7,   18,  19,  22,  23,  66,  67,  70,  71,  82,  83,
    86,  87,  8,   9,   12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,
    89,  92,  93,  10,  11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,
    90,  91,  94,  95,  32,  33,  36,  37,  48,  49,  52,  53,  96,  97,  100,
    101, 112, 113, 116, 117, 34,  35,  38,  39,  50,  51,  54,  55,  98,  99,
    102, 103, 114, 115, 118, 119, 40,  41,  44,  45,  56,  57,  60,  61,  104,
    105, 108, 109, 120, 121, 124, 125, 42,  43,  46,  47,  58,  59,  62,  63,
    106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
    149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
    150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
    153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
    154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
    165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
    166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
    169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
    170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
    255};

simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
  return __lsx_vshuf4i_b(vec, 0b10110001);
}
simdutf_really_inline __m256i lasx_swap_bytes(__m256i vec) {
  return __lasx_xvshuf4i_b(vec, 0b10110001);
}

simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.is_ascii();
}

simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
  return is_third_byte ^ is_fourth_byte;
}

// common functions for utf8 conversions
simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) {
  // Low half contains  10bbbbbb|10cccccc
  // High half contains 1110aaaa|1110aaaa
  const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9};
  const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};

  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
  // 1110aaaa => aaaa0000
  __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4);
  // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/
                                     perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
  // 0010bbbb bbcccccc => aaaabbbb bbcccccc
  composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff);

  return composed;
}

simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) {
  // 10bbbbb 110aaaaa => 00bbbbb 000aaaaa
  __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
  // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
  composed = __lsx_vbitsel_v(
      __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
      __lsx_vsrli_h(composed, 8),                   /* bbbbbb >> 8 */
      __lsx_vrepli_h(0x3f));                        /* 0x003f */
  return composed;
}

simdutf_really_inline __m128i
convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) {
  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
  // This is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes.
  __m128i sh =
      __lsx_vld(reinterpret_cast<const uint8_t *>(
                    simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
                0);
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 00000aaa aa000000
  __m128i v1f00 = lsx_splat_u16(0x1f00);
  __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  composed = __lsx_vadd_h(ascii, composed);
  return composed;
}

/* begin file src/lasx/lasx_validate_utf32le.cpp */
const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) {
  const char32_t *end = input + size;

  // Performance degradation when memory address is not 32-byte aligned
  while (((uint64_t)input & 0x1F) && input < end) {
    uint32_t word = *input++;
    if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
      return nullptr;
    }
  }

  __m256i offset = lasx_splat_u32(0xffff2000);
  __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff);
  __m256i standardmax = lasx_splat_u32(0x10ffff);
  __m256i currentmax = __lasx_xvldi(0x0);
  __m256i currentoffsetmax = __lasx_xvldi(0x0);

  while (input + 8 < end) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lasx_xvmax_wu(in, currentmax);
    // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
    currentoffsetmax =
        __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax);
    input += 8;
  }
  __m256i is_zero =
      __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax);
  if (__lasx_xbnz_v(is_zero)) {
    return nullptr;
  }

  is_zero = __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax),
                           standardoffsetmax);
  if (__lasx_xbnz_v(is_zero)) {
    return nullptr;
  }
  return input;
}

const result lasx_validate_utf32le_with_errors(const char32_t *input,
                                               size_t size) {
  const char32_t *start = input;
  const char32_t *end = input + size;

  // Performance degradation when memory address is not 32-byte aligned
  while (((uint64_t)input & 0x1F) && input < end) {
    uint32_t word = *input;
    if (word > 0x10FFFF) {
      return result(error_code::TOO_LARGE, input - start);
    }
    if (word >= 0xD800 && word <= 0xDFFF) {
      return result(error_code::SURROGATE, input - start);
    }
    input++;
  }

  __m256i offset = lasx_splat_u32(0xffff2000);
  __m256i standardoffsetmax = lasx_splat_u32(0xfffff7ff);
  __m256i standardmax = lasx_splat_u32(0x10ffff);
  __m256i currentmax = __lasx_xvldi(0x0);
  __m256i currentoffsetmax = __lasx_xvldi(0x0);

  while (input + 8 < end) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lasx_xvmax_wu(in, currentmax);
    currentoffsetmax =
        __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax);

    __m256i is_zero =
        __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax);
    if (__lasx_xbnz_v(is_zero)) {
      return result(error_code::TOO_LARGE, input - start);
    }
    is_zero =
        __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax),
                       standardoffsetmax);
    if (__lasx_xbnz_v(is_zero)) {
      return result(error_code::SURROGATE, input - start);
    }
    input += 8;
  }

  return result(error_code::SUCCESS, input - start);
}
/* end file src/lasx/lasx_validate_utf32le.cpp */

/* begin file src/lasx/lasx_convert_latin1_to_utf8.cpp */
/*
  Returns a pair: the first unprocessed byte from buf and utf8_output
  A scalar routing should carry on the conversion of the tail.
*/

std::pair<const char *, char *>
lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
                            char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const size_t safety_margin = 12;
  const char *end = latin1_input + len;

  // We always write 16 bytes, of which more than the first 8 bytes
  // are valid. A safety margin of 8 is more than sufficient.
  while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
    uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0);
    if (ascii_mask == 0xFFFF) {
      __lsx_vst(in8, utf8_output, 0);
      utf8_output += 16;
      latin1_input += 16;
      continue;
    }
    // We just fallback on UTF-16 code. This could be optimized/simplified
    // further.
    __m256i in16 = __lasx_vext2xv_hu_bu(____m256i(in8));
    // 1. prepare 2-byte values
    // input 8-bit word : [aabb|bbbb] x 16
    // expected output   : [1100|00aa|10bb|bbbb] x 16
    // t0 = [0000|00aa|bbbb|bb00]
    __m256i t0 = __lasx_xvslli_h(in16, 2);
    // t1 = [0000|00aa|0000|0000]
    __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x300));
    // t3 = [0000|00aa|00bb|bbbb]
    __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f));
    // t4 = [1100|00aa|10bb|bbbb]
    __m256i t3 = __lasx_xvor_v(t2, __lasx_xvreplgr2vr_h(uint16_t(0xc080)));
    // merge ASCII and 2-byte codewords
    __m256i one_byte_bytemask = __lasx_xvsle_hu(in16, __lasx_xvrepli_h(0x7F));
    __m256i utf8_unpacked = __lasx_xvbitsel_v(t3, in16, one_byte_bytemask);

    const uint8_t *row0 =
        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
            [lasx_1_2_utf8_bytes_mask[(ascii_mask & 0xFF)]][0];
    __m128i shuffle0 = __lsx_vld(row0 + 1, 0);
    __m128i utf8_unpacked_lo = lasx_extracti128_lo(utf8_unpacked);
    __m128i utf8_packed0 =
        __lsx_vshuf_b(utf8_unpacked_lo, utf8_unpacked_lo, shuffle0);
    __lsx_vst(utf8_packed0, utf8_output, 0);
    utf8_output += row0[0];

    const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                              [lasx_1_2_utf8_bytes_mask[(ascii_mask >> 8)]][0];
    __m128i shuffle1 = __lsx_vld(row1 + 1, 0);
    __m128i utf8_unpacked_hi = lasx_extracti128_hi(utf8_unpacked);
    __m128i utf8_packed1 =
        __lsx_vshuf_b(utf8_unpacked_hi, utf8_unpacked_hi, shuffle1);
    __lsx_vst(utf8_packed1, utf8_output, 0);
    utf8_output += row1[0];

    latin1_input += 16;
  } // while

  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
}
/* end file src/lasx/lasx_convert_latin1_to_utf8.cpp */
/* begin file src/lasx/lasx_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
lasx_convert_latin1_to_utf32(const char *buf, size_t len,
                             char32_t *utf32_output) {
  const char *end = buf + len;

  // LASX requires 32-byte alignment, otherwise performance will be degraded
  while (((uint64_t)utf32_output & 0x1F) && buf < end) {
    *utf32_output++ = ((uint32_t)*buf) & 0xFF;
    buf++;
  }

  while (end - buf >= 32) {
    __m256i in8 = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);

    __m256i in32_0 = __lasx_vext2xv_wu_bu(in8);
    __lasx_xvst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);

    __m256i in8_1 = __lasx_xvpermi_d(in8, 0b00000001);
    __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1);
    __lasx_xvst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 32);

    __m256i in8_2 = __lasx_xvpermi_d(in8, 0b00000010);
    __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2);
    __lasx_xvst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 64);

    __m256i in8_3 = __lasx_xvpermi_d(in8, 0b00000011);
    __m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3);
    __lasx_xvst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 96);

    utf32_output += 32;
    buf += 32;
  }

  if (end - buf >= 16) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);

    __m128i zero = __lsx_vldi(0);
    __m128i in16low = __lsx_vilvl_b(zero, in8);
    __m128i in16high = __lsx_vilvh_b(zero, in8);
    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);

    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);

    utf32_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf32_output);
}
/* end file src/lasx/lasx_convert_latin1_to_utf32.cpp */

/* begin file src/lasx/lasx_convert_utf8_to_utf32.cpp */
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_out) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xFFF;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
    // We process in chunks of 16 bytes.
    // use fast implementation in src/simdutf/arm64/simd.h
    // Ideally the compiler can keep the tables in registers.
    __m128i zero = __lsx_vldi(0);
    __m128i in16low = __lsx_vilvl_b(zero, in);
    __m128i in16high = __lsx_vilvh_b(zero, in);
    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);

    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);

    utf32_output += 16; // We wrote 16 32-bit characters.
    return 16;          // We consumed 16 bytes.
  }
  __m128i zero = __lsx_vldi(0);
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return 12;         // We consumed 12 bytes.
  }
  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
  if (input_utf8_end_of_code_point_mask == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
    // UTF-32 code units. Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);

    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
    utf32_output += 6;
    return 12; // We consumed 12 bytes.
  }
  // Either no fast path or an unimportant fast path.

  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code-code units
    // Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
    utf32_output += 6;
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // Shuffle
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);
    // Split
    // 00000000 00000000 0ccccccc
    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
    // Note: unmasked
    // xxxxxxxx aaaaxxxx xxxxxxxx
    __m128i high =
        __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
    // Use 16 bit bic instead of and.
    // The top bits will be corrected later in the bsl
    // 00000000 10bbbbbb 00000000
    __m128i middle =
        __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits
    // Combine low and middle with shift right accumulate
    // 00000000 00xxbbbb bbcccccc
    __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
    // Insert top 4 bits from high byte with bitwise select
    // 00000000 aaaabbbb bbcccccc
    __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000));
    __lsx_vst(composed, utf32_output, 0);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code-code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
      // UTF-32 code units. This uses the same method as the fixed 3 byte
      // version, reversing and shift left insert. However, there is no need for
      // a shuffle mask now, just rev16 and rev32.
      //
      // This version does not use the LUT, but 4 byte sequences are less common
      // and the overhead of the extra memory access is less important than the
      // early branch overhead in shorter sequences, so it comes last.

      // Swap pairs of bytes
      // 10dddddd|10cccccc|10bbbbbb|11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      __m128i swap = lsx_swap_bytes(in);
      // Shift left and insert
      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
      __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
                                       __lsx_vrepli_h(0x3f /*0x003F*/));
      // Shift insert again
      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
      __m128i merge2 =
          __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
                          __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
                          lsx_splat_u32(0x00000FFF));
      // Clear the garbage
      // 00000000 000aaabb bbbbcccc ccdddddd
      __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF));
      // Store
      __lsx_vst(composed, utf32_output, 0);
      utf32_output += 3; // We wrote 3 32-bit characters.
      return 12;         // We consumed 12 bytes.
    }
    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
    // due to surrogates no longer being involved.
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);

    // Ascii
    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
    __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00));
    // 00000000 00000000 0000cccc ccdddddd
    __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii);

    __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000));
    __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
    // Insert twice
    // 00000000 000aaabb bbbbxxxx xxxxxxxx
    __m128i corrected_srli2 =
        __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
    __m128i ab =
        __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
    ab = __lsx_vsrli_w(ab, 4);
    // 00000000 000aaabb bbbbcccc ccdddddd
    __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF));
    // Store
    __lsx_vst(composed, utf32_output, 0);
    utf32_output += 3; // We wrote 3 32-bit characters.
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    return 12;
  }
}
/* end file src/lasx/lasx_convert_utf8_to_utf32.cpp */
/* begin file src/lasx/lasx_convert_utf8_to_latin1.cpp */
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);

  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, which actually consumes the full 16.
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
    // We process in chunks of 16 bytes
    __lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
    latin1_output += 16; // We wrote 16 18-bit characters.
    return 16;           // We consumed 16 bytes.
  }
  /// We do not have a fast path available, or the fast path is unimportant, so
  /// we fallback.
  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we should have (idx < 64), if not, there is a bug in the validation or
  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6
  // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy
  // scenario we process SIX (6) input code-code units. The max length in bytes
  // of six code code units spanning between 1 and 2 bytes each is 12 bytes.
  __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                             simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                         0);
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
  // ascii mask
  // 1 byte: 11111111 11111111
  // 2 byte: 00000000 00000000
  __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
  // utf8 mask
  // 1 byte: 00000000 00000000
  // 2 byte: 00111111 00111111
  __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
                                   __lsx_vldi(0b00111111));
  // mask
  //  1 byte: 11111111 11111111
  //  2 byte: 00111111 00111111
  __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);

  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
  // writing 8 bytes even though we only care about the first 6 bytes.
  __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);

  __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
/* end file src/lasx/lasx_convert_utf8_to_latin1.cpp */

/* begin file src/lasx/lasx_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
lasx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                             char *latin1_output) {
  const char32_t *end = buf + len;
  const __m256i shuf_mask = ____m256i(
      (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
  __m256i v_ff = __lasx_xvrepli_w(0xFF);

  while (end - buf >= 16) {
    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    __m256i in12 = __lasx_xvor_v(in1, in2);
    if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) {
      // 1. pack the bytes
      __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
      latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
      __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
      latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
      // 2. store (8 bytes)
      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
      // 3. adjust pointers
      buf += 16;
      latin1_output += 16;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
lasx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                         char *latin1_output) {
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  const __m256i shuf_mask = ____m256i(
      (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
  __m256i v_ff = __lasx_xvrepli_w(0xFF);

  while (end - buf >= 16) {
    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    __m256i in12 = __lasx_xvor_v(in1, in2);
    if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) {
      // 1. pack the bytes
      __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
      latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
      __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
      latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
      // 2. store (8 bytes)
      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
      // 3. adjust pointers
      buf += 16;
      latin1_output += 16;
    } else {
      // Let us do a scalar fallback.
      for (int k = 0; k < 16; k++) {
        uint32_t word = buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/lasx/lasx_convert_utf32_to_latin1.cpp */
/* begin file src/lasx/lasx_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *end = buf + len;

  // load addr align 32
  while (((uint64_t)buf & 0x1F) && buf < end) {
    uint32_t word = *buf;
    if ((word & 0xFFFFFF80) == 0) {
      *utf8_output++ = char(word);
    } else if ((word & 0xFFFFF800) == 0) {
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else if ((word & 0xFFFF0000) == 0) {
      if (word >= 0xD800 && word <= 0xDFFF) {
        return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else {
      if (word > 0x10FFFF) {
        return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    }
    buf++;
  }

  __m256i v_c080 = lasx_splat_u16(0xc080);
  __m256i v_07ff = lasx_splat_u16(0x07ff);
  __m256i v_dfff = lasx_splat_u16(0xdfff);
  __m256i v_d800 = lasx_splat_u16(0xd800);
  __m256i zero = __lasx_xvldi(0);
  __m128i zero_128 = __lsx_vldi(0);
  __m256i forbidden_bytemask = __lasx_xvldi(0x0);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    // Check if no bits set above 16th
    if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
      __m256i utf16_packed =
          __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);

      if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
                                       utf16_packed))) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        __m256i utf8_packed = __lasx_xvpermi_d(
            __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
        // 2. store (8 bytes)
        __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8

        // t0 = [000a|aaaa|bbbb|bb00]
        const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00));
        // t2 = [0000|0000|00bb|bbbb]
        const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
        // t3 = [000a|aaaa|00bb|bbbb]
        const __m256i t3 = __lasx_xvor_v(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const __m256i t4 = __lasx_xvor_v(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
        __m256i utf8_unpacked =
            __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
        // 3. prepare bitmask for 8-bit lookup
        __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
        uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
        uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
        // 4. pack the bytes
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m1]][0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_packed1 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);

        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m2]][0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_packed2 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
        // 5. store bytes
        __lsx_vst(utf8_packed1, utf8_output, 0);
        utf8_output += row1[0];

        __lsx_vst(utf8_packed2, utf8_output, 0);
        utf8_output += row2[0];

        buf += 16;
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        forbidden_bytemask = __lasx_xvor_v(
            __lasx_xvand_v(
                __lasx_xvsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
                __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
            forbidden_bytemask);
        /* In this branch we handle three cases:
            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
           single UFT-8 byte
            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
           two UTF-8 bytes
            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
           three UTF-8 bytes

            We expand the input word (16-bit) into two code units (32-bit), thus
            we have room for four bytes. However, we need five distinct bit
            layouts. Note that the last byte in cases #2 and #3 is the same.

            We precompute byte 1 for case #1 and the common byte for cases #2 &
           #3 in register t2.

            We precompute byte 1 for case #3 and -- **conditionally** --
           precompute either byte 1 for case #2 or byte 2 for case #3. Note that
           they differ by exactly one bit.

            Finally from these two code units we build proper UTF-8 sequence,
           taking into account the case (i.e, the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
        t0 = __lasx_xvilvl_b(t0, t0);
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
        __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3f00));
        // [00bb|bbbb|0000|aaaa]
        __m256i s2 = __lasx_xvor_v(s0, s1);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
        __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
        // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF);
        __m256i one_or_two_bytes_bytemask =
            __lasx_xvsle_hu(utf16_packed, v_07ff);
        __m256i m0 =
            __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000));
        __m256i s4 = __lasx_xvxor_v(s3, m0);

        // 4. expand code units 16-bit => 32-bit
        __m256i out0 = __lasx_xvilvl_h(s4, t2);
        __m256i out1 = __lasx_xvilvh_h(s4, t2);

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));

        __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
        __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);

        __m256i one_byte_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
        __m256i one_byte_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);

        __m256i mask0 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
                          one_byte_bytemask_u16_to_u32_low));
        __m256i mask1 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
                          one_byte_bytemask_u16_to_u32_high));

        uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle0 = __lsx_vld(row0, 1);
        __m128i utf8_0 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
        __lsx_vst(utf8_0, utf8_output, 0);
        utf8_output += row0[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 0);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_1 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
        __lsx_vst(utf8_1, utf8_output, 0);
        utf8_output += row1[0];

        mask = __lasx_xvpickve2gr_wu(mask0, 4);
        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_2 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
        __lsx_vst(utf8_2, utf8_output, 0);
        utf8_output += row2[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 4);
        const uint8_t *row3 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle3 = __lsx_vld(row3, 1);
        __m128i utf8_3 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
        __lsx_vst(utf8_3, utf8_output, 0);
        utf8_output += row3[0];

        buf += 16;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  if (__lasx_xbnz_v(forbidden_bytemask)) {
    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
  }
  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
}

std::pair<result, char *>
lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                       char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  // load addr align 32
  while (((uint64_t)buf & 0x1F) && buf < end) {
    uint32_t word = *buf;
    if ((word & 0xFFFFFF80) == 0) {
      *utf8_output++ = char(word);
    } else if ((word & 0xFFFFF800) == 0) {
      *utf8_output++ = char((word >> 6) | 0b11000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else if ((word & 0xFFFF0000) == 0) {
      if (word >= 0xD800 && word <= 0xDFFF) {
        return std::make_pair(result(error_code::SURROGATE, buf - start),
                              reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 12) | 0b11100000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    } else {
      if (word > 0x10FFFF) {
        return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                              reinterpret_cast<char *>(utf8_output));
      }
      *utf8_output++ = char((word >> 18) | 0b11110000);
      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *utf8_output++ = char((word & 0b111111) | 0b10000000);
    }
    buf++;
  }

  __m256i v_c080 = lasx_splat_u16(0xc080);
  __m256i v_07ff = lasx_splat_u16(0x07ff);
  __m256i v_dfff = lasx_splat_u16(0xdfff);
  __m256i v_d800 = lasx_splat_u16(0xd800);
  __m256i zero = __lasx_xvldi(0);
  __m128i zero_128 = __lsx_vldi(0);
  __m256i forbidden_bytemask = __lasx_xvldi(0x0);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);

    // Check if no bits set above 16th
    if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
      __m256i utf16_packed =
          __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);

      if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
                                       utf16_packed))) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        __m256i utf8_packed = __lasx_xvpermi_d(
            __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
        // 2. store (8 bytes)
        __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
        // 3. adjust pointers
        buf += 16;
        utf8_output += 16;
        continue; // we are done for this round!
      }

      if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8

        // t0 = [000a|aaaa|bbbb|bb00]
        const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const __m256i t1 = __lasx_xvand_v(t0, lasx_splat_u16(0x1f00));
        // t2 = [0000|0000|00bb|bbbb]
        const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
        // t3 = [000a|aaaa|00bb|bbbb]
        const __m256i t3 = __lasx_xvor_v(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const __m256i t4 = __lasx_xvor_v(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
        __m256i utf8_unpacked =
            __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
        // 3. prepare bitmask for 8-bit lookup
        __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
        uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
        uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
        // 4. pack the bytes
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m1]][0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_packed1 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);

        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lasx_1_2_utf8_bytes_mask[m2]][0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_packed2 = __lsx_vshuf_b(
            zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
        // 5. store bytes
        __lsx_vst(utf8_packed1, utf8_output, 0);
        utf8_output += row1[0];

        __lsx_vst(utf8_packed2, utf8_output, 0);
        utf8_output += row2[0];

        buf += 16;
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        forbidden_bytemask = __lasx_xvor_v(
            __lasx_xvand_v(
                __lasx_xvsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
                __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
            forbidden_bytemask);
        if (__lasx_xbnz_v(forbidden_bytemask)) {
          return std::make_pair(result(error_code::SURROGATE, buf - start),
                                reinterpret_cast<char *>(utf8_output));
        }
        /* In this branch we handle three cases:
            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
           single UFT-8 byte
            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
           two UTF-8 bytes
            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
           three UTF-8 bytes

            We expand the input word (16-bit) into two code units (32-bit), thus
            we have room for four bytes. However, we need five distinct bit
            layouts. Note that the last byte in cases #2 and #3 is the same.

            We precompute byte 1 for case #1 and the common byte for cases #2 &
           #3 in register t2.

            We precompute byte 1 for case #3 and -- **conditionally** --
           precompute either byte 1 for case #2 or byte 2 for case #3. Note that
           they differ by exactly one bit.

            Finally from these two code units we build proper UTF-8 sequence,
           taking into account the case (i.e, the number of bytes to write).
        */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
        t0 = __lasx_xvilvl_b(t0, t0);
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
        __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        __m256i t2 = __lasx_xvor_v(t1, lasx_splat_u16(0x8000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        s1 = __lasx_xvand_v(s1, lasx_splat_u16(0x3F00));
        // [00bb|bbbb|0000|aaaa]
        __m256i s2 = __lasx_xvor_v(s0, s1);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
        __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
        // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF);
        __m256i one_or_two_bytes_bytemask =
            __lasx_xvsle_hu(utf16_packed, v_07ff);
        __m256i m0 =
            __lasx_xvandn_v(one_or_two_bytes_bytemask, lasx_splat_u16(0x4000));
        __m256i s4 = __lasx_xvxor_v(s3, m0);

        // 4. expand code units 16-bit => 32-bit
        __m256i out0 = __lasx_xvilvl_h(s4, t2);
        __m256i out1 = __lasx_xvilvh_h(s4, t2);

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        __m256i one_byte_bytemask =
            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));

        __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
        __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);

        __m256i one_byte_bytemask_u16_to_u32_low =
            __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
        __m256i one_byte_bytemask_u16_to_u32_high =
            __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);

        __m256i mask0 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
                          one_byte_bytemask_u16_to_u32_low));
        __m256i mask1 = __lasx_xvmskltz_h(
            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
                          one_byte_bytemask_u16_to_u32_high));

        uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle0 = __lsx_vld(row0, 1);
        __m128i utf8_0 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
        __lsx_vst(utf8_0, utf8_output, 0);
        utf8_output += row0[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 0);
        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_1 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
        __lsx_vst(utf8_1, utf8_output, 0);
        utf8_output += row1[0];

        mask = __lasx_xvpickve2gr_wu(mask0, 4);
        const uint8_t *row2 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle2 = __lsx_vld(row2, 1);
        __m128i utf8_2 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
        __lsx_vst(utf8_2, utf8_output, 0);
        utf8_output += row2[0];

        mask = __lasx_xvpickve2gr_wu(mask1, 4);
        const uint8_t *row3 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
                                                                  [0];
        __m128i shuffle3 = __lsx_vld(row3, 1);
        __m128i utf8_3 =
            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
        __lsx_vst(utf8_3, utf8_output, 0);
        utf8_output += row3[0];

        buf += 16;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char *>(utf8_output));
}
/* end file src/lasx/lasx_convert_utf32_to_utf8.cpp */
/* begin file src/lasx/lasx_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  // credit: Wojciech Muła
  // SSE (lookup: pshufb improved unrolled)
  const uint8_t *input = (const uint8_t *)src;
  static const char *lookup_tbl =
      isbase64url
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  uint8_t *out = (uint8_t *)dst;

  v32u8 shuf;
  __m256i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
      base64_tbl2, base64_tbl3;
  if (srclen >= 28) {
    shuf = v32u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
                 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};

    v_fc0fc00 = __lasx_xvreplgr2vr_w(uint32_t(0x0fc0fc00));
    v_3f03f0 = __lasx_xvreplgr2vr_w(uint32_t(0x003f03f0));
    shift_r = __lasx_xvreplgr2vr_w(uint32_t(0x0006000a));
    shift_l = __lasx_xvreplgr2vr_w(uint32_t(0x00080004));
    base64_tbl0 = ____m256i(__lsx_vld(lookup_tbl, 0));
    base64_tbl1 = ____m256i(__lsx_vld(lookup_tbl, 16));
    base64_tbl2 = ____m256i(__lsx_vld(lookup_tbl, 32));
    base64_tbl3 = ____m256i(__lsx_vld(lookup_tbl, 48));
  }
  size_t i = 0;
  for (; i + 100 <= srclen; i += 96) {
    __m128i in0_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
    __m128i in0_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
    __m128i in1_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
    __m128i in1_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);
    __m128i in2_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 4);
    __m128i in2_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 5);
    __m128i in3_lo =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 6);
    __m128i in3_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 7);

    __m256i in0 = lasx_set_q(in0_hi, in0_lo);
    __m256i in1 = lasx_set_q(in1_hi, in1_lo);
    __m256i in2 = lasx_set_q(in2_hi, in2_lo);
    __m256i in3 = lasx_set_q(in3_hi, in3_lo);

    in0 = __lasx_xvshuf_b(in0, in0, (__m256i)shuf);
    in1 = __lasx_xvshuf_b(in1, in1, (__m256i)shuf);
    in2 = __lasx_xvshuf_b(in2, in2, (__m256i)shuf);
    in3 = __lasx_xvshuf_b(in3, in3, (__m256i)shuf);

    __m256i t0_0 = __lasx_xvand_v(in0, v_fc0fc00);
    __m256i t0_1 = __lasx_xvand_v(in1, v_fc0fc00);
    __m256i t0_2 = __lasx_xvand_v(in2, v_fc0fc00);
    __m256i t0_3 = __lasx_xvand_v(in3, v_fc0fc00);

    __m256i t1_0 = __lasx_xvsrl_h(t0_0, shift_r);
    __m256i t1_1 = __lasx_xvsrl_h(t0_1, shift_r);
    __m256i t1_2 = __lasx_xvsrl_h(t0_2, shift_r);
    __m256i t1_3 = __lasx_xvsrl_h(t0_3, shift_r);

    __m256i t2_0 = __lasx_xvand_v(in0, v_3f03f0);
    __m256i t2_1 = __lasx_xvand_v(in1, v_3f03f0);
    __m256i t2_2 = __lasx_xvand_v(in2, v_3f03f0);
    __m256i t2_3 = __lasx_xvand_v(in3, v_3f03f0);

    __m256i t3_0 = __lasx_xvsll_h(t2_0, shift_l);
    __m256i t3_1 = __lasx_xvsll_h(t2_1, shift_l);
    __m256i t3_2 = __lasx_xvsll_h(t2_2, shift_l);
    __m256i t3_3 = __lasx_xvsll_h(t2_3, shift_l);

    __m256i input0 = __lasx_xvor_v(t1_0, t3_0);
    __m256i input0_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input0);
    __m256i input0_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input0, __lasx_xvldi(32)));
    __m256i input0_mask = __lasx_xvslei_bu(input0, 31);
    __m256i input0_result =
        __lasx_xvbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
    __lasx_xvst(input0_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;

    __m256i input1 = __lasx_xvor_v(t1_1, t3_1);
    __m256i input1_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input1);
    __m256i input1_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input1, __lasx_xvldi(32)));
    __m256i input1_mask = __lasx_xvslei_bu(input1, 31);
    __m256i input1_result =
        __lasx_xvbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
    __lasx_xvst(input1_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;

    __m256i input2 = __lasx_xvor_v(t1_2, t3_2);
    __m256i input2_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input2);
    __m256i input2_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input2, __lasx_xvldi(32)));
    __m256i input2_mask = __lasx_xvslei_bu(input2, 31);
    __m256i input2_result =
        __lasx_xvbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
    __lasx_xvst(input2_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;

    __m256i input3 = __lasx_xvor_v(t1_3, t3_3);
    __m256i input3_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input3);
    __m256i input3_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input3, __lasx_xvldi(32)));
    __m256i input3_mask = __lasx_xvslei_bu(input3, 31);
    __m256i input3_result =
        __lasx_xvbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
    __lasx_xvst(input3_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;
  }
  for (; i + 28 <= srclen; i += 24) {

    __m128i in_lo = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);
    __m128i in_hi =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);

    __m256i in = lasx_set_q(in_hi, in_lo);

    // bytes from groups A, B and C are needed in separate 32-bit lanes
    // in = [DDDD|CCCC|BBBB|AAAA]
    //
    //      an input triplet has layout
    //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
    //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next
    //        triplet
    //
    //      shuffling changes the order of bytes: 1, 0, 2, 1
    //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
    //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
    //                  processed bits
    in = __lasx_xvshuf_b(in, in, (__m256i)shuf);

    // unpacking
    // t0    = [0000cccc|cc000000|aaaaaa00|00000000]
    __m256i t0 = __lasx_xvand_v(in, v_fc0fc00);
    // t1    = [00000000|00cccccc|00000000|00aaaaaa]
    //          ((c >> 6),  (a >> 10))
    __m256i t1 = __lasx_xvsrl_h(t0, shift_r);

    // t2    = [00000000|00dddddd|000000bb|bbbb0000]
    __m256i t2 = __lasx_xvand_v(in, v_3f03f0);
    // t3    = [00dddddd|00000000|00bbbbbb|00000000]
    //          ((d << 8), (b << 4))
    __m256i t3 = __lasx_xvsll_h(t2, shift_l);

    // res   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
    __m256i indices = __lasx_xvor_v(t1, t3);

    __m256i indices_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, indices);
    __m256i indices_shuf1 = __lasx_xvshuf_b(
        base64_tbl3, base64_tbl2, __lasx_xvsub_b(indices, __lasx_xvldi(32)));
    __m256i indices_mask = __lasx_xvslei_bu(indices, 31);
    __m256i indices_result =
        __lasx_xvbitsel_v(indices_shuf1, indices_shuf0, indices_mask);
    __lasx_xvst(indices_result, reinterpret_cast<__m256i *>(out), 0);
    out += 32;
  }

  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                        srclen - i, options);
}

static inline void compress(__m128i data, uint16_t mask, char *output) {
  if (mask == 0) {
    __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
    return;
  }
  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

  v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
                    tables::base64::thintable_epi8[mask2]};

  // we increment by 0x08 the second half of the mask
  const v4u32 hi = {0, 0, 0x08080808, 0x08080808};
  __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);

  // this is the version "nearly pruned"
  __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask, what it does is to write
  // only the first pop1 bytes from the first 8 bytes, and then
  // it fills in with the bytes from the second 8 bytes + some filling
  // at the end.
  __m128i compactmask =
      __lsx_vld(reinterpret_cast<const __m128i *>(
                    tables::base64::pshufb_combine_table + pop1 * 8),
                0);
  __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);

  __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
}

struct block64 {
  __m256i chunks[2];
};

template <bool base64_url, bool default_or_url>
static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
  __m256i ascii_space_tbl =
      ____m256i((__m128i)v16u8{0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                               0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0});
  // credit: aqrit
  __m256i delta_asso;
  if (default_or_url) {
    delta_asso =
        ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
                                 0x0, 0x0, 0x0, 0x0, 0x11, 0x0, 0x16});
  } else {
    delta_asso =
        ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0,
                                 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF});
  }
  __m256i delta_values;
  if (default_or_url) {
    delta_values = ____m256i(
        (__m128i)v16i8{int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0x13),
                       int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
                       int8_t(0xB9), int8_t(0x00), int8_t(0xFF), int8_t(0x11),
                       int8_t(0xFF), int8_t(0xBF), int8_t(0x10), int8_t(0xB9)});
  } else if (base64_url) {
    delta_values = ____m256i(
        (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
                       int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
                       int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
                       int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)});
  } else {
    delta_values = ____m256i(
        (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
                       int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
                       int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
                       int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)});
  }

  __m256i check_asso;
  if (default_or_url) {
    check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
                                          0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
                                          0x0B, 0x0E, 0x0B, 0x06});

  } else if (base64_url) {
    check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
                                          0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
                                          0x0B, 0x06, 0x0B, 0x12});
  } else {
    check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
                                          0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
                                          0x0B, 0x0B, 0x0B, 0x0F});
  }

  __m256i check_values;
  if (default_or_url) {

    check_values = ____m256i(
        (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
                       int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
                       int8_t(0xB5), int8_t(0xA1), int8_t(0x00), int8_t(0x80),
                       int8_t(0x00), int8_t(0x80), int8_t(0x00), int8_t(0x80)});
  } else if (base64_url) {
    check_values = ____m256i(
        (__m128i)v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80),
                       int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
                       int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
                       int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)});
  } else {
    check_values = ____m256i(
        (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
                       int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
                       int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
                       int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)});
  }

  __m256i shifted = __lasx_xvsrli_b(*src, 3);
  __m256i asso_index = __lasx_xvand_v(*src, __lasx_xvldi(0xF));
  __m256i delta_hash = __lasx_xvavgr_bu(
      __lasx_xvshuf_b(delta_asso, delta_asso, asso_index), shifted);
  __m256i check_hash = __lasx_xvavgr_bu(
      __lasx_xvshuf_b(check_asso, check_asso, asso_index), shifted);

  __m256i out = __lasx_xvsadd_b(
      __lasx_xvshuf_b(delta_values, delta_values, delta_hash), *src);
  __m256i chk = __lasx_xvsadd_b(
      __lasx_xvshuf_b(check_values, check_values, check_hash), *src);
  __m256i chk_ltz = __lasx_xvmskltz_b(chk);
  unsigned int mask = __lasx_xvpickve2gr_wu(chk_ltz, 0);
  mask = mask | (__lsx_vpickve2gr_hu(lasx_extracti128_hi(chk_ltz), 0) << 16);
  if (mask) {
    __m256i ascii_space = __lasx_xvseq_b(
        __lasx_xvshuf_b(ascii_space_tbl, ascii_space_tbl, asso_index), *src);
    __m256i ascii_space_ltz = __lasx_xvmskltz_b(ascii_space);
    unsigned int ascii_space_mask = __lasx_xvpickve2gr_wu(ascii_space_ltz, 0);
    ascii_space_mask =
        ascii_space_mask |
        (__lsx_vpickve2gr_hu(lasx_extracti128_hi(ascii_space_ltz), 0) << 16);
    *error |= (mask != ascii_space_mask);
  }

  *src = out;
  return (uint32_t)mask;
}

template <bool base64_url, bool default_or_url>
static inline uint64_t to_base64_mask(block64 *b, bool *error) {
  *error = 0;
  uint64_t m0 =
      to_base64_mask<base64_url, default_or_url>(&b->chunks[0], error);
  uint64_t m1 =
      to_base64_mask<base64_url, default_or_url>(&b->chunks[1], error);
  return m0 | (m1 << 32);
}

static inline void copy_block(block64 *b, char *output) {
  __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output), 0);
  __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output), 32);
}

static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t nmask = ~mask;
  uint64_t count =
      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
  uint16_t *count_ptr = (uint16_t *)&count;
  compress(lasx_extracti128_lo(b->chunks[0]), uint16_t(mask), output);
  compress(lasx_extracti128_hi(b->chunks[0]), uint16_t(mask >> 16),
           output + count_ptr[0]);
  compress(lasx_extracti128_lo(b->chunks[1]), uint16_t(mask >> 32),
           output + count_ptr[0] + count_ptr[1]);
  compress(lasx_extracti128_hi(b->chunks[1]), uint16_t(mask >> 48),
           output + count_ptr[0] + count_ptr[1] + count_ptr[2]);
  return count_ones(nmask);
}

template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }

inline size_t compress_block_single(block64 *b, uint64_t mask, char *output) {
  const size_t pos64 = trailing_zeroes(mask);
  const int8_t pos = pos64 & 0xf;

  // Predefine the index vector
  const v16u8 v1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  switch (pos64 >> 4) {
  case 0b00: {
    const __m128i lane0 = lasx_extracti128_lo(b->chunks[0]);
    const __m128i lane1 = lasx_extracti128_hi(b->chunks[0]);

    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); // v1 > v0
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(lane0, lane0, sh);

    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 0 * 16), 0);
    __lsx_vst(lane1, reinterpret_cast<__m128i *>(output + 1 * 16 - 1), 0);
    __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output + 2 * 16 - 1),
                0);
  } break;
  case 0b01: {
    const __m128i lane0 = lasx_extracti128_lo(b->chunks[0]);
    const __m128i lane1 = lasx_extracti128_hi(b->chunks[0]);
    __lsx_vst(lane0, reinterpret_cast<__m128i *>(output + 0 * 16), 0);

    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1);
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(lane1, lane1, sh);

    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 1 * 16), 0);
    __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output + 2 * 16 - 1),
                0);
  } break;
  case 0b10: {
    __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output + 0 * 16), 0);

    const __m128i lane2 = lasx_extracti128_lo(b->chunks[1]);
    const __m128i lane3 = lasx_extracti128_hi(b->chunks[1]);

    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1);
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(lane2, lane2, sh);

    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 2 * 16), 0);
    __lsx_vst(lane3, reinterpret_cast<__m128i *>(output + 3 * 16 - 1), 0);
  } break;
  case 0b11: {
    __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output + 0 * 16), 0);
    __lsx_vst(lasx_extracti128_lo(b->chunks[1]),
              reinterpret_cast<__m128i *>(output + 2 * 16), 0);

    const __m128i lane3 = lasx_extracti128_hi(b->chunks[1]);

    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1);
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(lane3, lane3, sh);

    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 3 * 16), 0);
  } break;
  }
  return 63;
}

// The caller of this function is responsible to ensure that there are 64 bytes
// available from reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char *src) {
  b->chunks[0] = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0);
  b->chunks[1] = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32);
}

// The caller of this function is responsible to ensure that there are 128 bytes
// available from reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char16_t *src) {
  __m256i m1 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0);
  __m256i m2 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32);
  __m256i m3 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 64);
  __m256i m4 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 96);
  b->chunks[0] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m2, m1, 0), 0b11011000);
  b->chunks[1] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m4, m3, 0), 0b11011000);
}

static inline void base64_decode(char *out, __m256i str) {
  __m256i t0 = __lasx_xvor_v(
      __lasx_xvslli_w(str, 26),
      __lasx_xvslli_w(__lasx_xvand_v(str, lasx_splat_u32(0x0000ff00)), 12));
  __m256i t1 =
      __lasx_xvsrli_w(__lasx_xvand_v(str, lasx_splat_u32(0x003f0000)), 2);
  __m256i t2 = __lasx_xvor_v(t0, t1);
  __m256i t3 = __lasx_xvor_v(t2, __lasx_xvsrli_w(str, 16));
  __m256i pack_shuffle = ____m256i(
      (__m128i)v16u8{3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, 0, 0, 0, 0});
  t3 = __lasx_xvshuf_b(t3, t3, (__m256i)pack_shuffle);

  // Store the output:
  __lsx_vst(lasx_extracti128_lo(t3), out, 0);
  __lsx_vst(lasx_extracti128_hi(t3), out, 12);
}
// decode 64 bytes and output 48 bytes
static inline void base64_decode_block(char *out, const char *src) {
  base64_decode(out, __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0));
  base64_decode(out + 24,
                __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32));
}

static inline void base64_decode_block_safe(char *out, const char *src) {
  base64_decode(out, __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0));
  alignas(32) char buffer[32];
  base64_decode(buffer,
                __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32));
  std::memcpy(out + 24, buffer, 24);
}

static inline void base64_decode_block(char *out, block64 *b) {
  base64_decode(out, b->chunks[0]);
  base64_decode(out + 24, b->chunks[1]);
}
static inline void base64_decode_block_safe(char *out, block64 *b) {
  base64_decode(out, b->chunks[0]);
  alignas(32) char buffer[32];
  base64_decode(buffer, b->chunks[1]);
  std::memcpy(out + 24, buffer, 24);
}

template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename chartype>
full_result
compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t equalsigns = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, full_input_length, 0};
  }
  char *end_of_safe_64byte_zone =
      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;

  const chartype *const srcinit = src;
  const char *const dstinit = dst;
  const chartype *const srcend = src + srclen;

  constexpr size_t block_size = 6;
  static_assert(block_size >= 2, "block_size must be at least two");
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const chartype *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      bool error = false;
      uint64_t badcharmask =
          to_base64_mask<base64_url, default_or_url>(&b, &error);
      if (error && !ignore_garbage) {
        src -= 64;
        while (src < srcend && scalar::base64::is_eight_byte(*src) &&
               to_base64[uint8_t(*src)] <= 64) {
          src++;
        }
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      if (badcharmask != 0) {
        if (is_power_of_two(badcharmask)) {
          bufferptr += compress_block_single(&b, badcharmask, bufferptr);
        } else {
          bufferptr += compress_block(&b, badcharmask, bufferptr);
        }
      } else if (bufferptr != buffer) {
        copy_block(&b, bufferptr);
        bufferptr += 64;
      } else {
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, &b);
        } else {
          base64_decode_block(dst, &b);
        }
        dst += 48;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 2); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        if (dst >= end_of_safe_64byte_zone) {
          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
        } else {
          base64_decode_block(dst, buffer + (block_size - 2) * 64);
        }
        dst += 48;
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }

  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {

    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if ((!scalar::base64::is_eight_byte(*src) || val > 64) &&
          !ignore_garbage) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    if (dst >= end_of_safe_64byte_zone) {
      base64_decode_block_safe(dst, buffer_start);
    } else {
      base64_decode_block(dst, buffer_start);
    }
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      // lasx is little-endian
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 4);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      // lasx is little-endian
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r = scalar::base64::patch_tail_result(
        r, size_t(src - srcinit), size_t(dst - dstinit), equallocation,
        full_input_length, last_chunk_options);
    // When is_partial(last_chunk_options) is true, we must either end with
    // the end of the stream (beyond whitespace) or right after a non-ignorable
    // character or at the very beginning of the stream.
    // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
    if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
        r.input_count < full_input_length) {
      // First check if we can extend the input to the end of the stream
      while (r.input_count < full_input_length &&
             base64_ignorable(*(srcinit + r.input_count), options)) {
        r.input_count++;
      }
      // If we are still not at the end of the stream, then we must backtrack
      // to the last non-ignorable character.
      if (r.input_count < full_input_length) {
        while (r.input_count > 0 &&
               base64_ignorable(*(srcinit + r.input_count - 1), options)) {
          r.input_count--;
        }
      }
    }
    return r;
  }
  if (equalsigns > 0 && !ignore_garbage) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}
/* end file src/lasx/lasx_base64.cpp */
/* begin file src/lasx/lasx_find.cpp */
simdutf_really_inline const char *util_find(const char *start, const char *end,
                                            char character) noexcept {
  if (start >= end)
    return end;

  const int step = 32;
  __m256i char_vec = __lasx_xvreplgr2vr_b(static_cast<uint16_t>(character));

  while (end - start >= step) {
    __m256i data = __lasx_xvld(reinterpret_cast<const __m256i *>(start), 0);
    __m256i cmp = __lasx_xvseq_b(data, char_vec);
    if (__lasx_xbnz_v(cmp)) {
      __m256i res = __lasx_xvmsknz_b(cmp);
      uint32_t mask0 = __lasx_xvpickve2gr_wu(res, 0);
      uint32_t mask1 = __lasx_xvpickve2gr_wu(res, 4);
      uint32_t mask = (mask0 | (mask1 << 16));
      return start + trailing_zeroes(mask);
    }

    start += step;
  }

  // Handle remaining bytes with scalar loop
  for (; start < end; ++start) {
    if (*start == character) {
      return start;
    }
  }

  return end;
}

simdutf_really_inline const char16_t *util_find(const char16_t *start,
                                                const char16_t *end,
                                                char16_t character) noexcept {
  if (start >= end)
    return end;

  const int step = 16;
  __m256i char_vec = __lasx_xvreplgr2vr_h(static_cast<uint16_t>(character));

  while (end - start >= step) {
    __m256i data = __lasx_xvld(reinterpret_cast<const __m256i *>(start), 0);
    __m256i cmp = __lasx_xvseq_h(data, char_vec);
    if (__lasx_xbnz_v(cmp)) {
      __m256i res = __lasx_xvmsknz_b(cmp);
      uint32_t mask0 = __lasx_xvpickve2gr_wu(res, 0);
      uint32_t mask1 = __lasx_xvpickve2gr_wu(res, 4);
      uint32_t mask = (mask0 | (mask1 << 16));
      return start + trailing_zeroes(mask) / 2;
    }

    start += step;
  }

  // Handle remaining elements with scalar loop
  for (; start < end; ++start) {
    if (*start == character) {
      return start;
    }
  }

  return end;
}
/* end file src/lasx/lasx_find.cpp */

} // namespace
} // namespace lasx
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace lasx {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0. In
   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        0b11110000u - 1,
                                        0b11100000u - 1,
                                        0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */
/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace lasx {
namespace {
namespace ascii_validation {

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      return false;
    }
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  return in.is_ascii();
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/ascii_validation.h */

  // transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then we have that margin-1 is the eight last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 inputs bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace lasx
} // namespace simdutf
  // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
  // transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two-complement's, so largest possible continuation
      // byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */

/* begin file src/generic/utf8.h */
namespace simdutf {
namespace lasx {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf8.h */

/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace lasx {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // We don't count 0th bytes in the vectorized loops above, this
    // is why we need to count them in the end.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/utf32.h */
/* begin file src/generic/base64lengths.h */
namespace simdutf {
namespace lasx {
namespace {
namespace base64_lengths {

simdutf_warn_unused size_t binary_length_from_base64(const char *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= length; pos += 64) {
    simd8x64<uint8_t> block(reinterpret_cast<const uint8_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

simdutf_warn_unused size_t binary_length_from_base64(const char16_t *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 32 <= length; pos += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char16_t c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

} // namespace base64_lengths
} // unnamed namespace
} // namespace lasx
} // namespace simdutf
/* end file src/generic/base64lengths.h */

//
// Implementation-specific overrides
//
namespace simdutf {
namespace lasx {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return lasx::utf8_validation::generic_validate_utf8(buf, len);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return lasx::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return lasx::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return lasx::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // empty input is valid. protected the implementation from nullptr.
    return true;
  }
  const char32_t *tail = lasx_validate_utf32le(buf, len);
  if (tail) {
    return scalar::utf32::validate(tail, len - (tail - buf));
  } else {
    return false;
  }
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = lasx_validate_utf32le_with_errors(buf, len);
  if (res.count != len) {
    result scalar_res =
        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      lasx_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      lasx_convert_latin1_to_utf32(buf, len, utf32_output);
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  size_t pos = 0;
  char *output_start{latin1_output};
  // Performance degradation when memory address is not 32-byte aligned
  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
    if (buf[pos] & 0x80) {
      if (pos + 1 >= len)
        return 0;
      if ((buf[pos] & 0b11100000) == 0b11000000) {
        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
          return 0;
        uint32_t code_point =
            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
        if (code_point < 0x80 || 0xFF < code_point) {
          return 0;
        }
        *latin1_output++ = char(code_point);
        pos += 2;
      } else {
        return 0;
      }
    } else {
      *latin1_output++ = char(buf[pos]);
      pos++;
    }
  }
  size_t convert_size = latin1_output - output_start;
  if (pos == len)
    return convert_size;
  utf8_to_latin1::validating_transcoder converter;
  size_t convert_result =
      converter.convert(buf + pos, len - pos, latin1_output);
  return convert_result ? convert_size + convert_result : 0;
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  size_t pos = 0;
  char *output_start{latin1_output};
  // Performance degradation when memory address is not 32-byte aligned
  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
    if (buf[pos] & 0x80) {
      if ((buf[pos] & 0b11100000) == 0b11000000) {
        if (pos + 1 >= len)
          return result(error_code::TOO_SHORT, pos);
        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
          return result(error_code::TOO_SHORT, pos);
        uint32_t code_point =
            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
        if (code_point < 0x80)
          return result(error_code::OVERLONG, pos);
        if (0xFF < code_point)
          return result(error_code::TOO_LARGE, pos);
        *latin1_output++ = char(code_point);
        pos += 2;
      } else if ((buf[pos] & 0b11110000) == 0b11100000) {
        return result(error_code::TOO_LARGE, pos);
      } else if ((buf[pos] & 0b11111000) == 0b11110000) {
        return result(error_code::TOO_LARGE, pos);
      } else {
        if ((buf[pos] & 0b11000000) == 0b10000000) {
          return result(error_code::TOO_LONG, pos);
        }
        return result(error_code::HEADER_BITS, pos);
      }
    } else {
      *latin1_output++ = char(buf[pos]);
      pos++;
    }
  }
  size_t convert_size = latin1_output - output_start;
  if (pos == len)
    return result(error_code::SUCCESS, convert_size);

  utf8_to_latin1::validating_transcoder converter;
  result res =
      converter.convert_with_errors(buf + pos, len - pos, latin1_output);
  return res.error ? result(res.error, res.count + pos)
                   : result(res.error, res.count + convert_size);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  size_t pos = 0;
  char *output_start{latin1_output};
  // Performance degradation when memory address is not 32-byte aligned
  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
    if (buf[pos] & 0x80) {
      if (pos + 1 >= len)
        break;
      if ((buf[pos] & 0b11100000) == 0b11000000) {
        if ((buf[pos + 1] & 0b11000000) != 0b10000000)
          return 0;
        uint32_t code_point =
            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
        *latin1_output++ = char(code_point);
        pos += 2;
      } else {
        return 0;
      }
    } else {
      *latin1_output++ = char(buf[pos]);
      pos++;
    }
  }
  size_t convert_size = latin1_output - output_start;
  if (pos == len)
    return convert_size;

  size_t convert_result =
      lasx::utf8_to_latin1::convert_valid(buf + pos, len - pos, latin1_output);
  return convert_result ? convert_size + convert_result : 0;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return 0;
  }
  std::pair<const char32_t *, char *> ret =
      lasx_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lasx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lasx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
        ret.first, len - (ret.first - buf), ret.second);
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf32_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  size_t pos = 0;
  size_t count = 0;
  // Performance degradation when memory address is not 32-byte aligned
  while ((((uint64_t)input + pos) & 0x1F && pos < length)) {
    if (input[pos++] > -65) {
      count++;
    }
  }
  __m256i v_bf = __lasx_xvldi(0xBF); // 0b10111111
  for (; pos + 32 <= length; pos += 32) {
    __m256i in = __lasx_xvld(reinterpret_cast<const int8_t *>(input + pos), 0);
    __m256i utf8_count =
        __lasx_xvpcnt_h(__lasx_xvmskltz_b(__lasx_xvslt_b(v_bf, in)));
    count = count + __lasx_xvpickve2gr_wu(utf8_count, 0) +
            __lasx_xvpickve2gr_wu(utf8_count, 4);
  }
  return count + scalar::utf8::count_code_points(input + pos, length - pos);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  const uint8_t *data_end = data + length;
  uint64_t result = 0;
  while (data_end - data > 16) {
    uint64_t two_bytes = 0;
    __m128i input_vec = __lsx_vld(data, 0);
    two_bytes =
        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
    result += 16 + two_bytes;
    data += 16;
  }
  return result + scalar::latin1::utf8_length_from_latin1((const char *)data,
                                                          data_end - data);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64_impl<true>(output, input, length,
                                                       options, line_length);
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return util_find(start, end, character);
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return util_find(start, end, character);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

} // namespace lasx
} // namespace simdutf

/* begin file src/simdutf/lasx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP

#if SIMDUTF_CAN_ALWAYS_RUN_LASX
// nothing needed.
#else
SIMDUTF_UNTARGET_REGION
#endif
/* end file src/simdutf/lasx/end.h */
/* end file src/lasx/implementation.cpp */
#endif
#if SIMDUTF_IMPLEMENTATION_LSX
/* begin file src/lsx/implementation.cpp */
/* begin file src/simdutf/lsx/begin.h */
// redefining SIMDUTF_IMPLEMENTATION to "lsx"
// #define SIMDUTF_IMPLEMENTATION lsx
#define SIMDUTF_SIMD_HAS_UNSIGNED_CMP 1
/* end file src/simdutf/lsx/begin.h */
namespace simdutf {
namespace lsx {
namespace {
#ifndef SIMDUTF_LSX_H
  #error "lsx.h must be included"
#endif
using namespace simd;

// convert vmskltz/vmskgez/vmsknz to
// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index
const uint8_t lsx_1_2_utf8_bytes_mask[] = {
    0,   1,   4,   5,   16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,
    85,  2,   3,   6,   7,   18,  19,  22,  23,  66,  67,  70,  71,  82,  83,
    86,  87,  8,   9,   12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,
    89,  92,  93,  10,  11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,
    90,  91,  94,  95,  32,  33,  36,  37,  48,  49,  52,  53,  96,  97,  100,
    101, 112, 113, 116, 117, 34,  35,  38,  39,  50,  51,  54,  55,  98,  99,
    102, 103, 114, 115, 118, 119, 40,  41,  44,  45,  56,  57,  60,  61,  104,
    105, 108, 109, 120, 121, 124, 125, 42,  43,  46,  47,  58,  59,  62,  63,
    106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
    149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
    150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
    153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
    154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
    165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
    166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
    169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
    170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
    255};

simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
  return __lsx_vshuf4i_b(vec, 0b10110001);
}

simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
  return input.is_ascii();
}

simdutf_really_inline simd8<bool>
must_be_2_3_continuation(const simd8<uint8_t> prev2,
                         const simd8<uint8_t> prev3) {
  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
  return is_third_byte ^ is_fourth_byte;
}

// common functions for utf8 conversions
simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) {
  // Low half contains  10bbbbbb|10cccccc
  // High half contains 1110aaaa|1110aaaa
  const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9};
  const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};

  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
  // 1110aaaa => aaaa0000
  __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4);
  // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/
                                     perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
  // 0010bbbb bbcccccc => aaaabbbb bbcccccc
  composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff);

  return composed;
}

simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) {
  // 10bbbbb 110aaaaa => 00bbbbb 000aaaaa
  __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
  // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
  composed = __lsx_vbitsel_v(
      __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
      __lsx_vsrli_h(composed, 8),                   /* bbbbbb >> 8 */
      __lsx_vrepli_h(0x3f));                        /* 0x003f */
  return composed;
}

simdutf_really_inline __m128i
convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) {
  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
  // This is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes.
  __m128i sh =
      __lsx_vld(reinterpret_cast<const uint8_t *>(
                    simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
                0);
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000000 00bbbbbb
  __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
  // 1 byte: 00000000 00000000
  // 2 byte: 00000aaa aa000000
  const __m128i v1f00 = lsx_splat_u16(0x1f00);
  __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
  // Combine with a shift right accumulate
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 00000aaa aabbbbbb
  composed = __lsx_vadd_h(ascii, composed);
  return composed;
}

/* begin file src/lsx/lsx_validate_utf32le.cpp */
const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) {
  const char32_t *end = input + size;

  __m128i offset = lsx_splat_u32(0xffff2000);
  __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff);
  __m128i standardmax = lsx_splat_u32(0x10ffff);
  __m128i currentmax = lsx_splat_u32(0);
  __m128i currentoffsetmax = lsx_splat_u32(0);

  while (input + 4 < end) {
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lsx_vmax_wu(in, currentmax);
    // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
    currentoffsetmax =
        __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);

    input += 4;
  }

  __m128i is_zero =
      __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax);
  if (__lsx_bnz_v(is_zero)) {
    return nullptr;
  }

  is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
                         standardoffsetmax);
  if (__lsx_bnz_v(is_zero)) {
    return nullptr;
  }

  return input;
}

const result lsx_validate_utf32le_with_errors(const char32_t *input,
                                              size_t size) {
  const char32_t *start = input;
  const char32_t *end = input + size;

  __m128i offset = lsx_splat_u32(0xffff2000);
  __m128i standardoffsetmax = lsx_splat_u32(0xfffff7ff);
  __m128i standardmax = lsx_splat_u32(0x10ffff);
  __m128i currentmax = lsx_splat_u32(0);
  __m128i currentoffsetmax = lsx_splat_u32(0);

  while (input + 4 < end) {
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
    currentmax = __lsx_vmax_wu(in, currentmax);
    currentoffsetmax =
        __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);

    __m128i is_zero =
        __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax);
    if (__lsx_bnz_v(is_zero)) {
      return result(error_code::TOO_LARGE, input - start);
    }

    is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
                           standardoffsetmax);
    if (__lsx_bnz_v(is_zero)) {
      return result(error_code::SURROGATE, input - start);
    }

    input += 4;
  }

  return result(error_code::SUCCESS, input - start);
}
/* end file src/lsx/lsx_validate_utf32le.cpp */

/* begin file src/lsx/lsx_convert_latin1_to_utf8.cpp */
/*
  Returns a pair: the first unprocessed byte from buf and utf8_output
  A scalar routing should carry on the conversion of the tail.
*/

std::pair<const char *, char *>
lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
                           char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char *end = latin1_input + len;

  __m128i zero = __lsx_vldi(0);
  // We always write 16 bytes, of which more than the first 8 bytes
  // are valid. A safety margin of 8 is more than sufficient.
  while (end - latin1_input >= 16) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
    uint32_t ascii = __lsx_vpickve2gr_hu(__lsx_vmskgez_b(in8), 0);
    if (ascii == 0xffff) { // ASCII fast path!!!!
      __lsx_vst(in8, utf8_output, 0);
      utf8_output += 16;
      latin1_input += 16;
      continue;
    }
    // We just fallback on UTF-16 code. This could be optimized/simplified
    // further.
    __m128i in16 = __lsx_vilvl_b(zero, in8);
    // 1. prepare 2-byte values
    // input 8-bit word : [aabb|bbbb] x 8
    // expected output   : [1100|00aa|10bb|bbbb] x 8
    // t0 = [0000|00aa|bbbb|bb00]
    __m128i t0 = __lsx_vslli_h(in16, 2);
    // t1 = [0000|00aa|0000|0000]
    __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x300));
    // t3 = [0000|00aa|00bb|bbbb]
    __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f));
    // t4 = [1100|00aa|10bb|bbbb]
    __m128i t3 = __lsx_vor_v(t2, __lsx_vreplgr2vr_h(uint16_t(0xc080)));
    // merge ASCII and 2-byte codewords
    __m128i one_byte_bytemask = __lsx_vsle_hu(in16, __lsx_vrepli_h(0x7F));
    __m128i utf8_unpacked = __lsx_vbitsel_v(t3, in16, one_byte_bytemask);

    const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                             [lsx_1_2_utf8_bytes_mask[(ascii & 0xff)]][0];
    __m128i shuffle = __lsx_vld(row + 1, 0);
    __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);

    // store bytes
    __lsx_vst(utf8_packed, utf8_output, 0);
    // adjust pointers
    latin1_input += 8;
    utf8_output += row[0];

  } // while

  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
}
/* end file src/lsx/lsx_convert_latin1_to_utf8.cpp */
/* begin file src/lsx/lsx_convert_latin1_to_utf32.cpp */
std::pair<const char *, char32_t *>
lsx_convert_latin1_to_utf32(const char *buf, size_t len,
                            char32_t *utf32_output) {
  const char *end = buf + len;

  while (end - buf >= 16) {
    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);

    __m128i zero = __lsx_vldi(0);
    __m128i in16low = __lsx_vilvl_b(zero, in8);
    __m128i in16high = __lsx_vilvh_b(zero, in8);
    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);

    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output + 4), 0);
    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output + 8), 0);
    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output + 12), 0);

    utf32_output += 16;
    buf += 16;
  }

  return std::make_pair(buf, utf32_output);
}
/* end file src/lsx/lsx_convert_latin1_to_utf32.cpp */

/* begin file src/lsx/lsx_convert_utf8_to_utf32.cpp */
// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
// end of the code points. Only the least significant 12 bits of the mask
// are accessed.
// It returns how many bytes were consumed (up to 12).
size_t convert_masked_utf8_to_utf32(const char *input,
                                    uint64_t utf8_end_of_code_point_mask,
                                    char32_t *&utf32_out) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xFFF;
  //
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.
  //
  // We first try a few fast paths.
  if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
    // We process in chunks of 16 bytes.
    // use fast implementation in src/simdutf/arm64/simd.h
    // Ideally the compiler can keep the tables in registers.
    simd8<int8_t> temp{in};
    temp.store_ascii_as_utf32_tbl(utf32_out);
    utf32_output += 16; // We wrote 16 32-bit characters.
    return 16;          // We consumed 16 bytes.
  }
  __m128i zero = __lsx_vldi(0);
  if (input_utf8_end_of_code_point_mask == 0x924) {
    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
    // UTF-32 code units. Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return 12;         // We consumed 12 bytes.
  }
  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
  if (input_utf8_end_of_code_point_mask == 0xaaa) {
    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
    // UTF-32 code units. Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);

    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
    utf32_output += 6;
    return 12; // We consumed 12 bytes.
  }
  /// Either no fast path or an unimportant fast path.

  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];
  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];

  if (idx < 64) {
    // SIX (6) input code-code units
    // Convert to UTF-16
    __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);

    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
    utf32_output += 6;
    return consumed;
  } else if (idx < 145) {
    // FOUR (4) input code-code units
    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // Shuffle
    // 1 byte: 00000000 00000000 0ccccccc
    // 2 byte: 00000000 110bbbbb 10cccccc
    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);
    // Split
    // 00000000 00000000 0ccccccc
    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
    // Note: unmasked
    // xxxxxxxx aaaaxxxx xxxxxxxx
    __m128i high =
        __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
    // Use 16 bit bic instead of and.
    // The top bits will be corrected later in the bsl
    // 00000000 10bbbbbb 00000000
    __m128i middle =
        __lsx_vand_v(perm, lsx_splat_u32(0x0000FF00)); // 5 or 6 bits
    // Combine low and middle with shift right accumulate
    // 00000000 00xxbbbb bbcccccc
    __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
    // Insert top 4 bits from high byte with bitwise select
    // 00000000 aaaabbbb bbcccccc
    __m128i composed = __lsx_vbitsel_v(lowmid, high, lsx_splat_u32(0x0000F000));
    __lsx_vst(composed, utf32_output, 0);
    utf32_output += 4; // We wrote 4 32-bit characters.
    return consumed;
  } else if (idx < 209) {
    // THREE (3) input code-code units
    if (input_utf8_end_of_code_point_mask == 0x888) {
      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
      // UTF-32 code units. This uses the same method as the fixed 3 byte
      // version, reversing and shift left insert. However, there is no need for
      // a shuffle mask now, just rev16 and rev32.
      //
      // This version does not use the LUT, but 4 byte sequences are less common
      // and the overhead of the extra memory access is less important than the
      // early branch overhead in shorter sequences, so it comes last.

      // Swap pairs of bytes
      // 10dddddd|10cccccc|10bbbbbb|11110aaa
      // 10cccccc 10dddddd|11110aaa 10bbbbbb
      __m128i swap = lsx_swap_bytes(in);
      // Shift left and insert
      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
      __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
                                       __lsx_vrepli_h(0x3f /*0x003F*/));
      // Shift insert again
      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
      __m128i merge2 =
          __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
                          __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
                          lsx_splat_u32(0x00000FFF));
      // Clear the garbage
      // 00000000 000aaabb bbbbcccc ccdddddd
      __m128i composed = __lsx_vand_v(merge2, lsx_splat_u32(0x1FFFFF));
      // Store
      __lsx_vst(composed, utf32_output, 0);
      utf32_output += 3; // We wrote 3 32-bit characters.
      return 12;         // We consumed 12 bytes.
    }
    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
    // due to surrogates no longer being involved.
    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                           0);
    // 1 byte: 00000000 00000000 00000000 0ddddddd
    // 2 byte: 00000000 00000000 110ccccc 10dddddd
    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
    __m128i perm = __lsx_vshuf_b(zero, in, sh);

    // Ascii
    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
    __m128i middle = __lsx_vand_v(perm, lsx_splat_u32(0x00003f00));
    // 00000000 00000000 0000cccc ccdddddd
    __m128i cd = __lsx_vor_v(__lsx_vsrli_w(middle, 2), ascii);

    __m128i correction = __lsx_vand_v(perm, lsx_splat_u32(0x00400000));
    __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
    // Insert twice
    // 00000000 000aaabb bbbbxxxx xxxxxxxx
    __m128i corrected_srli2 =
        __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
    __m128i ab =
        __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
    ab = __lsx_vsrli_w(ab, 4);
    // 00000000 000aaabb bbbbcccc ccdddddd
    __m128i composed = __lsx_vbitsel_v(ab, cd, lsx_splat_u32(0x00000FFF));
    // Store
    __lsx_vst(composed, utf32_output, 0);
    utf32_output += 3; // We wrote 3 32-bit characters.
    return consumed;
  } else {
    // here we know that there is an error but we do not handle errors
    return 12;
  }
}
/* end file src/lsx/lsx_convert_utf8_to_utf32.cpp */
/* begin file src/lsx/lsx_convert_utf8_to_latin1.cpp */
size_t convert_masked_utf8_to_latin1(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
                                     char *&latin1_output) {
  // we use an approach where we try to process up to 12 input bytes.
  // Why 12 input bytes and not 16? Because we are concerned with the size of
  // the lookup tables. Also 12 is nicely divisible by two and three.
  //
  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);

  const uint16_t input_utf8_end_of_code_point_mask =
      utf8_end_of_code_point_mask & 0xfff;
  // Optimization note: our main path below is load-latency dependent. Thus it
  // is maybe beneficial to have fast paths that depend on branch prediction but
  // have less latency. This results in more instructions but, potentially, also
  // higher speeds.

  // We first try a few fast paths.
  // The obvious first test is ASCII, which actually consumes the full 16.
  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
    // We process in chunks of 16 bytes
    __lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
    latin1_output += 16; // We wrote 16 18-bit characters.
    return 16;           // We consumed 16 bytes.
  }
  /// We do not have a fast path available, or the fast path is unimportant, so
  /// we fallback.
  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][0];

  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
      [input_utf8_end_of_code_point_mask][1];
  // this indicates an invalid input:
  if (idx >= 64) {
    return consumed;
  }
  // Here we should have (idx < 64), if not, there is a bug in the validation or
  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
  // we process SIX (6) input code-code units. The max length in bytes of six
  // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6
  // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy
  // scenario we process SIX (6) input code-code units. The max length in bytes
  // of six code code units spanning between 1 and 2 bytes each is 12 bytes.
  __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
                             simdutf::tables::utf8_to_utf16::shufutf8[idx]),
                         0);
  // Shuffle
  // 1 byte: 00000000 0bbbbbbb
  // 2 byte: 110aaaaa 10bbbbbb
  sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
  // ascii mask
  // 1 byte: 11111111 11111111
  // 2 byte: 00000000 00000000
  __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
  // utf8 mask
  // 1 byte: 00000000 00000000
  // 2 byte: 00111111 00111111
  __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
                                   __lsx_vldi(0b00111111));
  // mask
  //  1 byte: 11111111 11111111
  //  2 byte: 00111111 00111111
  __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);

  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
  // writing 8 bytes even though we only care about the first 6 bytes.
  __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);

  uint64_t buffer[2];
  // __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
  __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(buffer), 0);
  std::memcpy(latin1_output, buffer, 6);
  latin1_output += 6; // We wrote 6 bytes.
  return consumed;
}
/* end file src/lsx/lsx_convert_utf8_to_latin1.cpp */

/* begin file src/lsx/lsx_convert_utf32_to_latin1.cpp */
std::pair<const char32_t *, char *>
lsx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
                            char *latin1_output) {
  const char32_t *end = buf + len;
  const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
  __m128i v_ff = __lsx_vrepli_w(0xFF);

  while (end - buf >= 16) {
    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);

    __m128i in12 = __lsx_vor_v(in1, in2);
    if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) {
      // 1. pack the bytes
      __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
      // 2. store (8 bytes)
      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
    }
  } // while
  return std::make_pair(buf, latin1_output);
}

std::pair<result, char *>
lsx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
                                        char *latin1_output) {
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
  __m128i v_ff = __lsx_vrepli_w(0xFF);

  while (end - buf >= 16) {
    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);

    __m128i in12 = __lsx_vor_v(in1, in2);

    if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) {
      // 1. pack the bytes
      __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
      // 2. store (8 bytes)
      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
      // 3. adjust pointers
      buf += 8;
      latin1_output += 8;
    } else {
      // Let us do a scalar fallback.
      for (int k = 0; k < 8; k++) {
        uint32_t word = buf[k];
        if (word <= 0xff) {
          *latin1_output++ = char(word);
        } else {
          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                latin1_output);
        }
      }
    }
  } // while
  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        latin1_output);
}
/* end file src/lsx/lsx_convert_utf32_to_latin1.cpp */
/* begin file src/lsx/lsx_convert_utf32_to_utf8.cpp */
std::pair<const char32_t *, char *>
lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *end = buf + len;

  __m128i v_c080 = lsx_splat_u16(0xc080);
  __m128i v_07ff = lsx_splat_u16(0x07ff);
  __m128i v_dfff = lsx_splat_u16(0xdfff);
  __m128i v_d800 = lsx_splat_u16(0xd800);
  __m128i forbidden_bytemask = __lsx_vldi(0x0);

  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);

    // Check if no bits set above 16th
    if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
      __m128i utf16_packed = __lsx_vpickev_h(nextin, in);

      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
                                   utf16_packed))) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
        // 2. store (8 bytes)
        __lsx_vst(utf8_packed, utf8_output, 0);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        continue; // we are done for this round!
      }
      __m128i zero = __lsx_vldi(0);
      if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8

        // t0 = [000a|aaaa|bbbb|bb00]
        const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00));
        // t2 = [0000|0000|00bb|bbbb]
        const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
        // t3 = [000a|aaaa|00bb|bbbb]
        const __m128i t3 = __lsx_vor_v(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const __m128i t4 = __lsx_vor_v(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        __m128i one_byte_bytemask =
            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
        __m128i utf8_unpacked =
            __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
        // 3. prepare bitmask for 8-bit lookup
        uint32_t m2 =
            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
        // 4. pack the bytes
        const uint8_t *row =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lsx_1_2_utf8_bytes_mask[m2]][0];
        __m128i shuffle = __lsx_vld(row, 1);
        __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
        // 5. store bytes
        __lsx_vst(utf8_packed, utf8_output, 0);

        // 6. adjust pointers
        buf += 8;
        utf8_output += row[0];
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        forbidden_bytemask = __lsx_vor_v(
            __lsx_vand_v(
                __lsx_vsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
                __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
            forbidden_bytemask);
        /* In this branch we handle three cases:
    1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single
    UFT-8 byte
    2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
    UTF-8 bytes
    3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
    UTF-8 bytes

    We expand the input word (16-bit) into two code units (32-bit), thus
    we have room for four bytes. However, we need five distinct bit
    layouts. Note that the last byte in cases #2 and #3 is the same.

    We precompute byte 1 for case #1 and the common byte for cases #2 & #3
    in register t2.

    We precompute byte 1 for case #3 and -- **conditionally** -- precompute
    either byte 1 for case #2 or byte 2 for case #3. Note that they
    differ by exactly one bit.

    Finally from these two code units we build proper UTF-8 sequence, taking
    into account the case (i.e, the number of bytes to write).
  */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
        t0 = __lsx_vilvl_b(t0, t0);
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
        __m128i t1 = __lsx_vand_v(t0, v_3f7f);
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        __m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        __m128i s1 = __lsx_vslli_h(utf16_packed, 2);
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00));
        // [00bb|bbbb|0000|aaaa]
        __m128i s2 = __lsx_vor_v(s0, s1);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
        __m128i s3 = __lsx_vor_v(s2, v_c0e0);
        __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
        __m128i m0 =
            __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000));
        __m128i s4 = __lsx_vxor_v(s3, m0);

        // 4. expand code units 16-bit => 32-bit
        __m128i out0 = __lsx_vilvl_h(s4, t2);
        __m128i out1 = __lsx_vilvh_h(s4, t2);

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        __m128i one_byte_bytemask =
            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));

        __m128i one_or_two_bytes_bytemask_u16_to_u32_low =
            __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
        __m128i one_or_two_bytes_bytemask_u16_to_u32_high =
            __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);

        __m128i one_byte_bytemask_u16_to_u32_low =
            __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
        __m128i one_byte_bytemask_u16_to_u32_high =
            __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);

        const uint32_t mask0 =
            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
                                    one_or_two_bytes_bytemask_u16_to_u32_low,
                                    one_byte_bytemask_u16_to_u32_low)),
                                0);
        const uint32_t mask1 =
            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
                                    one_or_two_bytes_bytemask_u16_to_u32_high,
                                    one_byte_bytemask_u16_to_u32_high)),
                                0);

        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
        __m128i shuffle0 = __lsx_vld(row0, 1);
        __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);

        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);

        __lsx_vst(utf8_0, utf8_output, 0);
        utf8_output += row0[0];
        __lsx_vst(utf8_1, utf8_output, 0);
        utf8_output += row1[0];

        buf += 8;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(nullptr,
                                  reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  // check for invalid input
  if (__lsx_bnz_v(forbidden_bytemask)) {
    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
  }

  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
}

std::pair<result, char *>
lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
                                      char *utf8_out) {
  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
  const char32_t *start = buf;
  const char32_t *end = buf + len;

  __m128i v_c080 = lsx_splat_u16(0xc080);
  __m128i v_07ff = lsx_splat_u16(0x07ff);
  __m128i v_dfff = lsx_splat_u16(0xdfff);
  __m128i v_d800 = lsx_splat_u16(0xd800);
  __m128i forbidden_bytemask = __lsx_vldi(0x0);
  const size_t safety_margin =
      12; // to avoid overruns, see issue
          // https://github.com/simdutf/simdutf/issues/92

  while (end - buf > std::ptrdiff_t(16 + safety_margin)) {
    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
    __m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);

    // Check if no bits set above 16th
    if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
      // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
      __m128i utf16_packed = __lsx_vpickev_h(nextin, in);

      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
                                   utf16_packed))) { // ASCII fast path!!!!
        // 1. pack the bytes
        // obviously suboptimal.
        __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
        // 2. store (8 bytes)
        __lsx_vst(utf8_packed, utf8_output, 0);
        // 3. adjust pointers
        buf += 8;
        utf8_output += 8;
        continue; // we are done for this round!
      }
      __m128i zero = __lsx_vldi(0);
      if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) {
        // 1. prepare 2-byte values
        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
        // expected output   : [110a|aaaa|10bb|bbbb] x 8

        // t0 = [000a|aaaa|bbbb|bb00]
        const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
        // t1 = [000a|aaaa|0000|0000]
        const __m128i t1 = __lsx_vand_v(t0, lsx_splat_u16(0x1f00));
        // t2 = [0000|0000|00bb|bbbb]
        const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
        // t3 = [000a|aaaa|00bb|bbbb]
        const __m128i t3 = __lsx_vor_v(t1, t2);
        // t4 = [110a|aaaa|10bb|bbbb]
        const __m128i t4 = __lsx_vor_v(t3, v_c080);
        // 2. merge ASCII and 2-byte codewords
        __m128i one_byte_bytemask =
            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
        __m128i utf8_unpacked =
            __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
        // 3. prepare bitmask for 8-bit lookup
        uint32_t m2 =
            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
        // 4. pack the bytes
        const uint8_t *row =
            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
                [lsx_1_2_utf8_bytes_mask[m2]][0];
        __m128i shuffle = __lsx_vld(row, 1);
        __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
        // 5. store bytes
        __lsx_vst(utf8_packed, utf8_output, 0);

        // 6. adjust pointers
        buf += 8;
        utf8_output += row[0];
        continue;
      } else {
        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
        forbidden_bytemask = __lsx_vor_v(
            __lsx_vand_v(
                __lsx_vsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
                __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
            forbidden_bytemask);
        if (__lsx_bnz_v(forbidden_bytemask)) {
          return std::make_pair(result(error_code::SURROGATE, buf - start),
                                reinterpret_cast<char *>(utf8_output));
        }
        /* In this branch we handle three cases:
    1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single
    UFT-8 byte
    2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
    UTF-8 bytes
    3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
    UTF-8 bytes

    We expand the input word (16-bit) into two code units (32-bit), thus
    we have room for four bytes. However, we need five distinct bit
    layouts. Note that the last byte in cases #2 and #3 is the same.

    We precompute byte 1 for case #1 and the common byte for cases #2 & #3
    in register t2.

    We precompute byte 1 for case #3 and -- **conditionally** -- precompute
    either byte 1 for case #2 or byte 2 for case #3. Note that they
    differ by exactly one bit.

    Finally from these two code units we build proper UTF-8 sequence, taking
    into account the case (i.e, the number of bytes to write).
  */
        /**
         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
         * t2 => [0ccc|cccc] [10cc|cccc]
         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
         */
        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
        __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
        t0 = __lsx_vilvl_b(t0, t0);
        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
        __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
        __m128i t1 = __lsx_vand_v(t0, v_3f7f);
        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
        __m128i t2 = __lsx_vor_v(t1, lsx_splat_u16(0x8000));

        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
        __m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
        __m128i s1 = __lsx_vslli_h(utf16_packed, 2);
        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
        s1 = __lsx_vand_v(s1, lsx_splat_u16(0x3F00));
        // [00bb|bbbb|0000|aaaa]
        __m128i s2 = __lsx_vor_v(s0, s1);
        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
        __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
        __m128i s3 = __lsx_vor_v(s2, v_c0e0);
        // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF);
        __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
        __m128i m0 =
            __lsx_vandn_v(one_or_two_bytes_bytemask, lsx_splat_u16(0x4000));
        __m128i s4 = __lsx_vxor_v(s3, m0);

        // 4. expand code units 16-bit => 32-bit
        __m128i out0 = __lsx_vilvl_h(s4, t2);
        __m128i out1 = __lsx_vilvh_h(s4, t2);

        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
        __m128i one_byte_bytemask =
            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));

        __m128i one_or_two_bytes_bytemask_u16_to_u32_low =
            __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
        __m128i one_or_two_bytes_bytemask_u16_to_u32_high =
            __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);

        __m128i one_byte_bytemask_u16_to_u32_low =
            __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
        __m128i one_byte_bytemask_u16_to_u32_high =
            __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);

        const uint32_t mask0 =
            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
                                    one_or_two_bytes_bytemask_u16_to_u32_low,
                                    one_byte_bytemask_u16_to_u32_low)),
                                0);
        const uint32_t mask1 =
            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
                                    one_or_two_bytes_bytemask_u16_to_u32_high,
                                    one_byte_bytemask_u16_to_u32_high)),
                                0);

        const uint8_t *row0 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
        __m128i shuffle0 = __lsx_vld(row0, 1);
        __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);

        const uint8_t *row1 =
            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
        __m128i shuffle1 = __lsx_vld(row1, 1);
        __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);

        __lsx_vst(utf8_0, utf8_output, 0);
        utf8_output += row0[0];
        __lsx_vst(utf8_1, utf8_output, 0);
        utf8_output += row1[0];

        buf += 8;
      }
      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
      // will produce four UTF-8 bytes.
    } else {
      // Let us do a scalar fallback.
      // It may seem wasteful to use scalar code, but being efficient with SIMD
      // in the presence of surrogate pairs may require non-trivial tables.
      size_t forward = 15;
      size_t k = 0;
      if (size_t(end - buf) < forward + 1) {
        forward = size_t(end - buf - 1);
      }
      for (; k < forward; k++) {
        uint32_t word = buf[k];
        if ((word & 0xFFFFFF80) == 0) {
          *utf8_output++ = char(word);
        } else if ((word & 0xFFFFF800) == 0) {
          *utf8_output++ = char((word >> 6) | 0b11000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else if ((word & 0xFFFF0000) == 0) {
          if (word >= 0xD800 && word <= 0xDFFF) {
            return std::make_pair(
                result(error_code::SURROGATE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 12) | 0b11100000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        } else {
          if (word > 0x10FFFF) {
            return std::make_pair(
                result(error_code::TOO_LARGE, buf - start + k),
                reinterpret_cast<char *>(utf8_output));
          }
          *utf8_output++ = char((word >> 18) | 0b11110000);
          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
          *utf8_output++ = char((word & 0b111111) | 0b10000000);
        }
      }
      buf += k;
    }
  } // while

  return std::make_pair(result(error_code::SUCCESS, buf - start),
                        reinterpret_cast<char *>(utf8_output));
}
/* end file src/lsx/lsx_convert_utf32_to_utf8.cpp */
/* begin file src/lsx/lsx_base64.cpp */
/**
 * References and further reading:
 *
 * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
 * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
 * https://arxiv.org/abs/1910.05109
 *
 * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
 * Instructions, ACM Transactions on the Web 12 (3), 2018.
 * https://arxiv.org/abs/1704.00605
 *
 * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
 * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
 * Request for Comments: 4648.
 *
 * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
 * http://www.alfredklomp.com/programming/sse-base64/. (2014).
 *
 * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
 * acceleration. https://github.com/aklomp/base64. (2014).
 *
 * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
 * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
 *
 * Nick Kopp. 2013. Base64 Encoding on a GPU.
 * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
 */

template <bool isbase64url>
size_t encode_base64(char *dst, const char *src, size_t srclen,
                     base64_options options) {
  // credit: Wojciech Muła
  // SSE (lookup: pshufb improved unrolled)
  const uint8_t *input = (const uint8_t *)src;
  static const char *lookup_tbl =
      isbase64url
          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  uint8_t *out = (uint8_t *)dst;

  v16u8 shuf;
  __m128i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
      base64_tbl2, base64_tbl3;
  if (srclen >= 16) {
    shuf = v16u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};
    v_fc0fc00 = __lsx_vreplgr2vr_w(uint32_t(0x0fc0fc00));
    v_3f03f0 = __lsx_vreplgr2vr_w(uint32_t(0x003f03f0));
    shift_r = __lsx_vreplgr2vr_w(uint32_t(0x0006000a));
    shift_l = __lsx_vreplgr2vr_w(uint32_t(0x00080004));
    base64_tbl0 = __lsx_vld(lookup_tbl, 0);
    base64_tbl1 = __lsx_vld(lookup_tbl, 16);
    base64_tbl2 = __lsx_vld(lookup_tbl, 32);
    base64_tbl3 = __lsx_vld(lookup_tbl, 48);
  }

  size_t i = 0;
  for (; i + 52 <= srclen; i += 48) {
    __m128i in0 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
    __m128i in1 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
    __m128i in2 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
    __m128i in3 =
        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);

    in0 = __lsx_vshuf_b(in0, in0, (__m128i)shuf);
    in1 = __lsx_vshuf_b(in1, in1, (__m128i)shuf);
    in2 = __lsx_vshuf_b(in2, in2, (__m128i)shuf);
    in3 = __lsx_vshuf_b(in3, in3, (__m128i)shuf);

    __m128i t0_0 = __lsx_vand_v(in0, v_fc0fc00);
    __m128i t0_1 = __lsx_vand_v(in1, v_fc0fc00);
    __m128i t0_2 = __lsx_vand_v(in2, v_fc0fc00);
    __m128i t0_3 = __lsx_vand_v(in3, v_fc0fc00);

    __m128i t1_0 = __lsx_vsrl_h(t0_0, shift_r);
    __m128i t1_1 = __lsx_vsrl_h(t0_1, shift_r);
    __m128i t1_2 = __lsx_vsrl_h(t0_2, shift_r);
    __m128i t1_3 = __lsx_vsrl_h(t0_3, shift_r);

    __m128i t2_0 = __lsx_vand_v(in0, v_3f03f0);
    __m128i t2_1 = __lsx_vand_v(in1, v_3f03f0);
    __m128i t2_2 = __lsx_vand_v(in2, v_3f03f0);
    __m128i t2_3 = __lsx_vand_v(in3, v_3f03f0);

    __m128i t3_0 = __lsx_vsll_h(t2_0, shift_l);
    __m128i t3_1 = __lsx_vsll_h(t2_1, shift_l);
    __m128i t3_2 = __lsx_vsll_h(t2_2, shift_l);
    __m128i t3_3 = __lsx_vsll_h(t2_3, shift_l);

    __m128i input0 = __lsx_vor_v(t1_0, t3_0);
    __m128i input0_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input0);
    __m128i input0_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input0, __lsx_vldi(32)));
    __m128i input0_mask = __lsx_vslei_bu(input0, 31);
    __m128i input0_result =
        __lsx_vbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
    __lsx_vst(input0_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;

    __m128i input1 = __lsx_vor_v(t1_1, t3_1);
    __m128i input1_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input1);
    __m128i input1_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input1, __lsx_vldi(32)));
    __m128i input1_mask = __lsx_vslei_bu(input1, 31);
    __m128i input1_result =
        __lsx_vbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
    __lsx_vst(input1_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;

    __m128i input2 = __lsx_vor_v(t1_2, t3_2);
    __m128i input2_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input2);
    __m128i input2_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input2, __lsx_vldi(32)));
    __m128i input2_mask = __lsx_vslei_bu(input2, 31);
    __m128i input2_result =
        __lsx_vbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
    __lsx_vst(input2_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;

    __m128i input3 = __lsx_vor_v(t1_3, t3_3);
    __m128i input3_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input3);
    __m128i input3_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
                                         __lsx_vsub_b(input3, __lsx_vldi(32)));
    __m128i input3_mask = __lsx_vslei_bu(input3, 31);
    __m128i input3_result =
        __lsx_vbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
    __lsx_vst(input3_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;
  }
  for (; i + 16 <= srclen; i += 12) {

    __m128i in = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);

    // bytes from groups A, B and C are needed in separate 32-bit lanes
    // in = [DDDD|CCCC|BBBB|AAAA]
    //
    //      an input triplet has layout
    //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
    //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next
    //        triplet
    //
    //      shuffling changes the order of bytes: 1, 0, 2, 1
    //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
    //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
    //                  processed bits
    in = __lsx_vshuf_b(in, in, (__m128i)shuf);

    // unpacking
    // t0    = [0000cccc|cc000000|aaaaaa00|00000000]
    __m128i t0 = __lsx_vand_v(in, v_fc0fc00);
    // t1    = [00000000|00cccccc|00000000|00aaaaaa]
    //          ((c >> 6),  (a >> 10))
    __m128i t1 = __lsx_vsrl_h(t0, shift_r);

    // t2    = [00000000|00dddddd|000000bb|bbbb0000]
    __m128i t2 = __lsx_vand_v(in, v_3f03f0);
    // t3    = [00dddddd|00000000|00bbbbbb|00000000]
    //          ((d << 8), (b << 4))
    __m128i t3 = __lsx_vsll_h(t2, shift_l);

    // res   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
    __m128i indices = __lsx_vor_v(t1, t3);

    __m128i indices_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, indices);
    __m128i indices_shuf1 = __lsx_vshuf_b(
        base64_tbl3, base64_tbl2, __lsx_vsub_b(indices, __lsx_vldi(32)));
    __m128i indices_mask = __lsx_vslei_bu(indices, 31);
    __m128i indices_result =
        __lsx_vbitsel_v(indices_shuf1, indices_shuf0, indices_mask);

    __lsx_vst(indices_result, reinterpret_cast<__m128i *>(out), 0);
    out += 16;
  }

  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                        srclen - i, options);
}

static inline void compress(__m128i data, uint16_t mask, char *output) {
  if (mask == 0) {
    __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
    return;
  }
  // this particular implementation was inspired by work done by @animetosho
  // we do it in two steps, first 8 bytes and then second 8 bytes
  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
  // next line just loads the 64-bit values thintable_epi8[mask1] and
  // thintable_epi8[mask2] into a 128-bit register, using only
  // two instructions on most compilers.

  v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
                    tables::base64::thintable_epi8[mask2]};

  // we increment by 0x08 the second half of the mask
  v4u32 hi = {0, 0, 0x08080808, 0x08080808};
  __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);

  // this is the version "nearly pruned"
  __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
  // we still need to put the two halves together.
  // we compute the popcount of the first half:
  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
  // then load the corresponding mask, what it does is to write
  // only the first pop1 bytes from the first 8 bytes, and then
  // it fills in with the bytes from the second 8 bytes + some filling
  // at the end.
  __m128i compactmask =
      __lsx_vld(reinterpret_cast<const __m128i *>(
                    tables::base64::pshufb_combine_table + pop1 * 8),
                0);
  __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);

  __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
}

struct block64 {
  __m128i chunks[4];
};

template <bool base64_url, bool default_or_url>
static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
  const v16u8 ascii_space_tbl = {0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                 0x0,  0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0};
  // credit: aqrit
  /*
  '0'(0x30)-'9'(0x39) => delta_values_index = 4
  'A'(0x41)-'Z'(0x5a) => delta_values_index = 4/5/12(4+8)
  'a'(0x61)-'z'(0x7a) => delta_values_index = 6/7/14(6+8)
  '+'(0x2b)           => delta_values_index = 3
  '/'(0x2f)           => delta_values_index = 2+8 = 10
  '-'(0x2d)           => delta_values_index = 2+8 = 10
  '_'(0x5f)           => delta_values_index = 5+8 = 13
  */
  v16u8 delta_asso;
  if (default_or_url) {
    delta_asso = v16u8{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                       0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x16};
  } else {
    delta_asso = v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
                       0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF};
  }
  v16i8 delta_values;
  if (default_or_url) {
    delta_values =
        v16i8{int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0x13),
              int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
              int8_t(0xB9), int8_t(0x00), int8_t(0xFF), int8_t(0x11),
              int8_t(0xFF), int8_t(0xBF), int8_t(0x10), int8_t(0xB9)};
  } else if (base64_url) {
    delta_values =
        v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
              int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
              int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
              int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)};
  } else {
    delta_values =
        v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
              int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
              int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
              int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)};
  }

  v16u8 check_asso;
  if (default_or_url) {
    check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                       0x01, 0x01, 0x03, 0x07, 0x0B, 0x0E, 0x0B, 0x06};
  } else if (base64_url) {
    check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                       0x01, 0x01, 0x03, 0x07, 0x0B, 0x06, 0x0B, 0x12};
  } else {
    check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                       0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F};
  }

  v16i8 check_values;
  if (default_or_url) {
    check_values =
        v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
              int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
              int8_t(0xB5), int8_t(0xA1), int8_t(0x00), int8_t(0x80),
              int8_t(0x00), int8_t(0x80), int8_t(0x00), int8_t(0x80)};
  } else if (base64_url) {
    check_values = v16i8{int8_t(0x0),  int8_t(0x80), int8_t(0x80), int8_t(0x80),
                         int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
                         int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
                         int8_t(0xB0), int8_t(0x80), int8_t(0x0),  int8_t(0x0)};
  } else {
    check_values =
        v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
              int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
              int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
              int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)};
  }

  const __m128i shifted = __lsx_vsrli_b(*src, 3);
  __m128i asso_index = __lsx_vand_v(*src, __lsx_vldi(0xF));
  const __m128i delta_hash =
      __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)delta_asso, (__m128i)delta_asso,
                                   (__m128i)asso_index),
                     shifted);
  const __m128i check_hash =
      __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)check_asso, (__m128i)check_asso,
                                   (__m128i)asso_index),
                     shifted);

  const __m128i out =
      __lsx_vsadd_b(__lsx_vshuf_b((__m128i)delta_values, (__m128i)delta_values,
                                  (__m128i)delta_hash),
                    *src);
  const __m128i chk =
      __lsx_vsadd_b(__lsx_vshuf_b((__m128i)check_values, (__m128i)check_values,
                                  (__m128i)check_hash),
                    *src);
  unsigned int mask = __lsx_vpickve2gr_hu(__lsx_vmskltz_b(chk), 0);
  if (mask) {
    __m128i ascii_space = __lsx_vseq_b(__lsx_vshuf_b((__m128i)ascii_space_tbl,
                                                     (__m128i)ascii_space_tbl,
                                                     (__m128i)asso_index),
                                       *src);
    *error |=
        (mask != __lsx_vpickve2gr_hu(__lsx_vmskltz_b((__m128i)ascii_space), 0));
  }

  *src = out;
  return (uint16_t)mask;
}

template <bool base64_url, bool default_or_url>
static inline uint64_t to_base64_mask(block64 *b, bool *error) {
  *error = 0;
  uint64_t m0 =
      to_base64_mask<base64_url, default_or_url>(&b->chunks[0], error);
  uint64_t m1 =
      to_base64_mask<base64_url, default_or_url>(&b->chunks[1], error);
  uint64_t m2 =
      to_base64_mask<base64_url, default_or_url>(&b->chunks[2], error);
  uint64_t m3 =
      to_base64_mask<base64_url, default_or_url>(&b->chunks[3], error);
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}

static inline void copy_block(block64 *b, char *output) {
  __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output), 0);
  __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output), 16);
  __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output), 32);
  __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output), 48);
}

static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
  uint64_t nmask = ~mask;
  uint64_t count =
      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
  uint16_t *count_ptr = (uint16_t *)&count;
  compress(b->chunks[0], uint16_t(mask), output);
  compress(b->chunks[1], uint16_t(mask >> 16), output + count_ptr[0]);
  compress(b->chunks[2], uint16_t(mask >> 32),
           output + count_ptr[0] + count_ptr[1]);
  compress(b->chunks[3], uint16_t(mask >> 48),
           output + count_ptr[0] + count_ptr[1] + count_ptr[2]);
  return count_ones(nmask);
}

template <typename T> bool is_power_of_two(T x) { return (x & (x - 1)) == 0; }

inline size_t compress_block_single(block64 *b, uint64_t mask, char *output) {
  const size_t pos64 = trailing_zeroes(mask);
  const int8_t pos = pos64 & 0xf;
  // Predefine the index vector
  const v16u8 v1 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  switch (pos64 >> 4) {
  case 0b00: {
    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1); // v1 > v0
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(b->chunks[0], b->chunks[0], sh);
    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 0 * 16), 0);
    __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output + 1 * 16 - 1),
              0);
    __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output + 2 * 16 - 1),
              0);
    __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output + 3 * 16 - 1),
              0);
  } break;

  case 0b01: {
    __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output + 0 * 16), 0);

    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1);
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(b->chunks[1], b->chunks[1], sh);

    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 1 * 16), 0);
    __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output + 2 * 16 - 1),
              0);
    __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output + 3 * 16 - 1),
              0);
  } break;

  case 0b10: {
    __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output + 0 * 16), 0);
    __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output + 1 * 16), 0);

    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1);
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(b->chunks[2], b->chunks[2], sh);

    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 2 * 16), 0);
    __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output + 3 * 16 - 1),
              0);
  } break;

  case 0b11: {
    __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output + 0 * 16), 0);
    __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output + 1 * 16), 0);
    __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output + 2 * 16), 0);

    const __m128i v0 = __lsx_vreplgr2vr_b((uint8_t)(pos - 1));
    const __m128i v2 = __lsx_vslt_b(v0, (__m128i)v1);
    const __m128i sh = __lsx_vsub_b((__m128i)v1, v2);
    const __m128i compressed = __lsx_vshuf_b(b->chunks[3], b->chunks[3], sh);

    __lsx_vst(compressed, reinterpret_cast<__m128i *>(output + 3 * 16), 0);
  } break;
  }
  return 63;
}

// The caller of this function is responsible to ensure that there are 64 bytes
// available from reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char *src) {
  b->chunks[0] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 0);
  b->chunks[1] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 16);
  b->chunks[2] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 32);
  b->chunks[3] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 48);
}

// The caller of this function is responsible to ensure that there are 128 bytes
// available from reading at src. The data is read into a block64 structure.
static inline void load_block(block64 *b, const char16_t *src) {
  __m128i m1 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 0);
  __m128i m2 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 16);
  __m128i m3 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 32);
  __m128i m4 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 48);
  __m128i m5 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 64);
  __m128i m6 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 80);
  __m128i m7 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 96);
  __m128i m8 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 112);
  b->chunks[0] = __lsx_vssrlni_bu_h(m2, m1, 0);
  b->chunks[1] = __lsx_vssrlni_bu_h(m4, m3, 0);
  b->chunks[2] = __lsx_vssrlni_bu_h(m6, m5, 0);
  b->chunks[3] = __lsx_vssrlni_bu_h(m8, m7, 0);
}

static inline void base64_decode(char *out, __m128i str) {
  __m128i t0 = __lsx_vor_v(
      __lsx_vslli_w(str, 26),
      __lsx_vslli_w(__lsx_vand_v(str, lsx_splat_u32(0x0000FF00)), 12));
  __m128i t1 = __lsx_vsrli_w(__lsx_vand_v(str, lsx_splat_u32(0x003F0000)), 2);
  __m128i t2 = __lsx_vor_v(t0, t1);
  __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16));
  const v16u8 pack_shuffle = {3, 2,  1,  7,  6, 5, 11, 10,
                              9, 15, 14, 13, 0, 0, 0,  0};
  t3 = __lsx_vshuf_b(t3, t3, (__m128i)pack_shuffle);

  // Store the output:
  // we only need 12.
  __lsx_vstelm_d(t3, out, 0, 0);
  __lsx_vstelm_w(t3, out + 8, 0, 2);
}
// decode 64 bytes and output 48 bytes
static inline void base64_decode_block(char *out, const char *src) {
  base64_decode(out, __lsx_vld(reinterpret_cast<const __m128i *>(src), 0));
  base64_decode(out + 12,
                __lsx_vld(reinterpret_cast<const __m128i *>(src), 16));
  base64_decode(out + 24,
                __lsx_vld(reinterpret_cast<const __m128i *>(src), 32));
  base64_decode(out + 36,
                __lsx_vld(reinterpret_cast<const __m128i *>(src), 48));
}
static inline void base64_decode_block_safe(char *out, const char *src) {
  base64_decode_block(out, src);
}
static inline void base64_decode_block(char *out, block64 *b) {
  base64_decode(out, b->chunks[0]);
  base64_decode(out + 12, b->chunks[1]);
  base64_decode(out + 24, b->chunks[2]);
  base64_decode(out + 36, b->chunks[3]);
}
static inline void base64_decode_block_safe(char *out, block64 *b) {
  base64_decode_block(out, b);
}

template <bool base64_url, bool ignore_garbage, bool default_or_url,
          typename char_type>
full_result
compress_decode_base64(char *dst, const char_type *src, size_t srclen,
                       base64_options options,
                       last_chunk_handling_options last_chunk_options) {
  const uint8_t *to_base64 =
      default_or_url ? tables::base64::to_base64_default_or_url_value
                     : (base64_url ? tables::base64::to_base64_url_value
                                   : tables::base64::to_base64_value);
  auto ri = simdutf::scalar::base64::find_end(src, srclen, options);
  size_t equallocation = ri.equallocation;
  size_t equalsigns = ri.equalsigns;
  srclen = ri.srclen;
  size_t full_input_length = ri.full_input_length;
  if (srclen == 0) {
    if (!ignore_garbage && equalsigns > 0) {
      return {INVALID_BASE64_CHARACTER, equallocation, 0};
    }
    return {SUCCESS, full_input_length, 0};
  }
  const char_type *const srcinit = src;
  const char *const dstinit = dst;
  const char_type *const srcend = src + srclen;

  constexpr size_t block_size = 10;
  char buffer[block_size * 64];
  char *bufferptr = buffer;
  if (srclen >= 64) {
    const char_type *const srcend64 = src + srclen - 64;
    while (src <= srcend64) {
      block64 b;
      load_block(&b, src);
      src += 64;
      bool error = false;
      uint64_t badcharmask =
          to_base64_mask<base64_url, default_or_url>(&b, &error);
      if (badcharmask) {
        if (error && !ignore_garbage) {
          src -= 64;
          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
                 to_base64[uint8_t(*src)] <= 64) {
            src++;
          }
          if (src < srcend) {
            // should never happen
          }
          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                  size_t(dst - dstinit)};
        }
      }

      if (badcharmask != 0) {
        if (is_power_of_two(badcharmask)) {
          bufferptr += compress_block_single(&b, badcharmask, bufferptr);
        } else {
          bufferptr += compress_block(&b, badcharmask, bufferptr);
        }
      } else {
        // optimization opportunity: if bufferptr == buffer and mask == 0, we
        // can avoid the call to compress_block and decode directly.
        copy_block(&b, bufferptr);
        bufferptr += 64;
      }
      if (bufferptr >= (block_size - 1) * 64 + buffer) {
        for (size_t i = 0; i < (block_size - 1); i++) {
          base64_decode_block(dst, buffer + i * 64);
          dst += 48;
        }
        std::memcpy(buffer, buffer + (block_size - 1) * 64,
                    64); // 64 might be too much
        bufferptr -= (block_size - 1) * 64;
      }
    }
  }
  char *buffer_start = buffer;
  // Optimization note: if this is almost full, then it is worth our
  // time, otherwise, we should just decode directly.
  int last_block = (int)((bufferptr - buffer_start) % 64);
  if (last_block != 0 && srcend - src + last_block >= 64) {
    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
      uint8_t val = to_base64[uint8_t(*src)];
      *bufferptr = char(val);
      if ((!scalar::base64::is_eight_byte(*src) || val > 64) &&
          !ignore_garbage) {
        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
                size_t(dst - dstinit)};
      }
      bufferptr += (val <= 63);
      src++;
    }
  }

  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
    base64_decode_block(dst, buffer_start);
    dst += 48;
  }
  if ((bufferptr - buffer_start) % 64 != 0) {
    while (buffer_start + 4 < bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      // lsx is little-endian
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 4);

      dst += 3;
      buffer_start += 4;
    }
    if (buffer_start + 4 <= bufferptr) {
      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
                        << 8;
      // lsx is little-endian
      triple = scalar::u32_swap_bytes(triple);
      std::memcpy(dst, &triple, 3);

      dst += 3;
      buffer_start += 4;
    }
    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
    // backtrack
    int leftover = int(bufferptr - buffer_start);
    while (leftover > 0) {
      if (!ignore_garbage) {
        while (to_base64[uint8_t(*(src - 1))] == 64) {
          src--;
        }
      } else {
        while (to_base64[uint8_t(*(src - 1))] >= 64) {
          src--;
        }
      }
      src--;
      leftover--;
    }
  }
  if (src < srcend + equalsigns) {
    full_result r = scalar::base64::base64_tail_decode(
        dst, src, srcend - src, equalsigns, options, last_chunk_options);
    r = scalar::base64::patch_tail_result(
        r, size_t(src - srcinit), size_t(dst - dstinit), equallocation,
        full_input_length, last_chunk_options);
    // When is_partial(last_chunk_options) is true, we must either end with
    // the end of the stream (beyond whitespace) or right after a non-ignorable
    // character or at the very beginning of the stream.
    // See https://tc39.es/proposal-arraybuffer-base64/spec/#sec-frombase64
    if (is_partial(last_chunk_options) && r.error == error_code::SUCCESS &&
        r.input_count < full_input_length) {
      // First check if we can extend the input to the end of the stream
      while (r.input_count < full_input_length &&
             base64_ignorable(*(srcinit + r.input_count), options)) {
        r.input_count++;
      }
      // If we are still not at the end of the stream, then we must backtrack
      // to the last non-ignorable character.
      if (r.input_count < full_input_length) {
        while (r.input_count > 0 &&
               base64_ignorable(*(srcinit + r.input_count - 1), options)) {
          r.input_count--;
        }
      }
    }
    return r;
  }
  if (equalsigns > 0 && !ignore_garbage) {
    if ((size_t(dst - dstinit) % 3 == 0) ||
        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
    }
  }
  return {SUCCESS, srclen, size_t(dst - dstinit)};
}
/* end file src/lsx/lsx_base64.cpp */
/* begin file src/lsx/lsx_find.cpp */
simdutf_really_inline const char *util_find(const char *start, const char *end,
                                            char character) noexcept {
  if (start >= end)
    return end;

  const int step = 16;
  __m128i char_vec = __lsx_vreplgr2vr_b(static_cast<uint8_t>(character));

  while (end - start >= step) {
    __m128i data = __lsx_vld(reinterpret_cast<const __m128i *>(start), 0);
    __m128i cmp = __lsx_vseq_b(data, char_vec);
    if (__lsx_bnz_v(cmp)) {
      uint16_t mask =
          static_cast<uint16_t>(__lsx_vpickve2gr_hu(__lsx_vmsknz_b(cmp), 0));
      return start + trailing_zeroes(mask);
    }

    start += step;
  }

  // Handle remaining bytes with scalar loop
  for (; start < end; ++start) {
    if (*start == character) {
      return start;
    }
  }

  return end;
}

simdutf_really_inline const char16_t *util_find(const char16_t *start,
                                                const char16_t *end,
                                                char16_t character) noexcept {
  if (start >= end)
    return end;

  const int step = 8;
  __m128i char_vec = __lsx_vreplgr2vr_h(static_cast<uint16_t>(character));

  while (end - start >= step) {
    __m128i data = __lsx_vld(reinterpret_cast<const __m128i *>(start), 0);
    __m128i cmp = __lsx_vseq_h(data, char_vec);
    if (__lsx_bnz_v(cmp)) {
      uint16_t mask =
          static_cast<uint16_t>(__lsx_vpickve2gr_hu(__lsx_vmsknz_b(cmp), 0));
      return start + trailing_zeroes(mask) / 2;
    }

    start += step;
  }

  // Handle remaining elements with scalar loop
  for (; start < end; ++start) {
    if (*start == character) {
      return start;
    }
  }

  return end;
}
/* end file src/lsx/lsx_find.cpp */

} // namespace
} // namespace lsx
} // namespace simdutf

/* begin file src/generic/buf_block_reader.h */
namespace simdutf {
namespace lsx {
namespace {

// Walks through a buffer in block-sized increments, loading the last part with
// spaces
template <size_t STEP_SIZE> struct buf_block_reader {
public:
  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
  simdutf_really_inline size_t block_index();
  simdutf_really_inline bool has_full_block() const;
  simdutf_really_inline const uint8_t *full_block() const;
  /**
   * Get the last block, padded with spaces.
   *
   * There will always be a last block, with at least 1 byte, unless len == 0
   * (in which case this function fills the buffer with spaces and returns 0. In
   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
   * block with STEP_SIZE bytes and no spaces for padding.
   *
   * @return the number of effective characters in the last block.
   */
  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
  simdutf_really_inline void advance();

private:
  const uint8_t *buf;
  const size_t len;
  const size_t lenminusstep;
  size_t idx;
};

template <size_t STEP_SIZE>
simdutf_really_inline
buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
      idx{0} {}

template <size_t STEP_SIZE>
simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
  return idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
  return idx < lenminusstep;
}

template <size_t STEP_SIZE>
simdutf_really_inline const uint8_t *
buf_block_reader<STEP_SIZE>::full_block() const {
  return &buf[idx];
}

template <size_t STEP_SIZE>
simdutf_really_inline size_t
buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
  if (len == idx) {
    return 0;
  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
  std::memset(dst, 0x20,
              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
                          // to write out 8 or 16 bytes at once.
  std::memcpy(dst, buf + idx, len - idx);
  return len - idx;
}

template <size_t STEP_SIZE>
simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
  idx += STEP_SIZE;
}

} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/buf_block_reader.h */
/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_validation {

using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

//
// Return nonzero if there are incomplete multibyte characters at the end of the
// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
//
simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // If the previous input's last 3 bytes match this, they're too short (they
  // ended at EOF):
  // ... 1111____ 111_____ 11______
  static const uint8_t max_array[32] = {255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        255,
                                        0b11110000u - 1,
                                        0b11100000u - 1,
                                        0b11000000u - 1};
  const simd8<uint8_t> max_value(
      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
  return input.gt_bits(max_value);
}

struct utf8_checker {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;
  // The last input we received
  simd8<uint8_t> prev_input_block;
  // Whether the last input we received was incomplete (used for ASCII fast
  // path)
  simd8<uint8_t> prev_incomplete;

  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  // The only problem that can happen at EOF is that a multibyte character is
  // too short or a byte value too large in the last bytes: check_special_cases
  // only checks for bytes too large in the first of two bytes.
  simdutf_really_inline void check_eof() {
    // If the previous block had incomplete UTF-8 characters at the end, an
    // ASCII block can't possibly finish them.
    this->error |= this->prev_incomplete;
  }

  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
    if (simdutf_likely(is_ascii(input))) {
      this->error |= this->prev_incomplete;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
                    "We support either two or four chunks per 64-byte block.");
      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
      }
      this->prev_incomplete =
          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
    }
  }

  // do not forget to call check_eof!
  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_validation

using utf8_validation::utf8_checker;

} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
/* begin file src/generic/utf8_validation/utf8_validator.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_validation {

/**
 * Validates that the string is actual UTF-8.
 */
template <class checker>
bool generic_validate_utf8(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  return !c.errors();
}

bool generic_validate_utf8(const char *input, size_t length) {
  return generic_validate_utf8<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

/**
 * Validates that the string is actual UTF-8 and stops on errors.
 */
template <class checker>
result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
  checker c{};
  buf_block_reader<64> reader(input, length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    c.check_next_input(in);
    if (c.errors()) {
      if (count != 0) {
        count--;
      } // Sometimes the error is only detected in the next chunk
      result res = scalar::utf8::rewind_and_validate_with_errors(
          reinterpret_cast<const char *>(input),
          reinterpret_cast<const char *>(input + count), length - count);
      res.count += count;
      return res;
    }
    reader.advance();
    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  c.check_next_input(in);
  reader.advance();
  c.check_eof();
  if (c.errors()) {
    if (count != 0) {
      count--;
    } // Sometimes the error is only detected in the next chunk
    result res = scalar::utf8::rewind_and_validate_with_errors(
        reinterpret_cast<const char *>(input),
        reinterpret_cast<const char *>(input) + count, length - count);
    res.count += count;
    return res;
  } else {
    return result(error_code::SUCCESS, length);
  }
}

result generic_validate_utf8_with_errors(const char *input, size_t length) {
  return generic_validate_utf8_with_errors<utf8_checker>(
      reinterpret_cast<const uint8_t *>(input), length);
}

} // namespace utf8_validation
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_validation/utf8_validator.h */
/* begin file src/generic/ascii_validation.h */
namespace simdutf {
namespace lsx {
namespace {
namespace ascii_validation {

result generic_validate_ascii_with_errors(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  size_t count{0};
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      result res = scalar::ascii::validate_with_errors(
          reinterpret_cast<const char *>(input + count), length - count);
      return result(res.error, count + res.count);
    }
    reader.advance();

    count += 64;
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  if (!in.is_ascii()) {
    result res = scalar::ascii::validate_with_errors(
        reinterpret_cast<const char *>(input + count), length - count);
    return result(res.error, count + res.count);
  } else {
    return result(error_code::SUCCESS, length);
  }
}

bool generic_validate_ascii(const char *input, size_t length) {
  buf_block_reader<64> reader(reinterpret_cast<const uint8_t *>(input), length);
  while (reader.has_full_block()) {
    simd::simd8x64<uint8_t> in(reader.full_block());
    if (!in.is_ascii()) {
      return false;
    }
    reader.advance();
  }
  uint8_t block[64]{};
  reader.get_remainder(block);
  simd::simd8x64<uint8_t> in(block);
  return in.is_ascii();
}

} // namespace ascii_validation
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/ascii_validation.h */

  // transcoding from UTF-8 to Latin 1
/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
  // 0b11000010 and nothing else.
  //
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
  constexpr const uint8_t FORBIDDEN = 0xff;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      FORBIDDEN,
      // 1110____ ________ <three byte lead in byte 1>
      FORBIDDEN,
      // 1111____ ________ <four+ byte lead in byte 1>
      FORBIDDEN);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              FORBIDDEN,
              // ____0101 ________
              FORBIDDEN,
              // ____011_ ________
              FORBIDDEN, FORBIDDEN,

              // ____1___ ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
              // ____1101 ________
              FORBIDDEN, FORBIDDEN, FORBIDDEN);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    this->error |= check_special_cases(input, prev1);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 16; margin--) {
      leading_byte += (int8_t(in[margin - 1]) >
                       -65); // twos complement of -65 is 1011 1111 ...
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask =
            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                               // this case, we also have ASCII to account for.
        if (utf8_continuation_mask & 1) {
          return 0; // error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
      if (howmany == 0) {
        return 0;
      }
      latin1_output += howmany;
    }
    return latin1_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char *latin1_output) {
    size_t pos = 0;
    char *start{latin1_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the eight last
    // leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store((int8_t *)latin1_output);
        latin1_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        if (errors()) {
          // rewind_and_convert_with_errors will seek a potential error from
          // in+pos onward, with the ability to go back up to pos bytes, and
          // read size-pos bytes forward.
          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, latin1_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_latin1(
              in + pos, utf8_end_of_code_point_mask, latin1_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      // rewind_and_convert_with_errors will seek a potential error from in+pos
      // onward, with the ability to go back up to pos bytes, and read size-pos
      // bytes forward.
      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, latin1_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        latin1_output += res.count;
      }
    }
    return result(error_code::SUCCESS, latin1_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_latin1
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_latin1 {
using namespace simd;

simdutf_really_inline size_t convert_valid(const char *in, size_t size,
                                           char *latin1_output) {
  size_t pos = 0;
  char *start{latin1_output};
  // In the worst case, we have the haswell kernel which can cause an overflow
  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
  // 16 bytes, and if the data is valid, then it is entirely safe because 16
  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
  // assume that you have valid UTF-8 input, so we are going to go back from the
  // end counting 8 leading bytes, to give us a good margin.
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) >
                     -65); // twos complement of -65 is 1011 1111 ...
  }
  // If the input is long enough, then we have that margin-1 is the eight last
  // leading byte.
  const size_t safety_margin = size - margin + 1; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    if (input.is_ascii()) {
      input.store((int8_t *)latin1_output);
      latin1_output += 64;
      pos += 64;
    } else {
      // you might think that a for-loop would work, but under Visual Studio, it
      // is not good enough.
      uint64_t utf8_continuation_mask =
          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
                             // this case, we also have ASCII to account for.
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      // We process in blocks of up to 12 bytes except possibly
      // for fast paths which may process up to 16 bytes. For the
      // slow path to work, we should have at least 12 input bytes left.
      size_t max_starting_point = (pos + 64) - 12;
      // Next loop is going to run at least five times.
      while (pos < max_starting_point) {
        // Performance note: our ability to compute 'consumed' and
        // then shift and recompute is critical. If there is a
        // latency of, say, 4 cycles on getting 'consumed', then
        // the inner loop might have a total latency of about 6 cycles.
        // Yet we process between 6 to 12 inputs bytes, thus we get
        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
        // for this section of the code. Hence, there is a limit
        // to how much we can further increase this latency before
        // it seriously harms performance.
        size_t consumed = convert_masked_utf8_to_latin1(
            in + pos, utf8_end_of_code_point_mask, latin1_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
      // At this point there may remain between 0 and 12 bytes in the
      // 64-byte block. These bytes will be processed again. So we have an
      // 80% efficiency (in the worst case). In practice we expect an
      // 85% to 90% efficiency.
    }
  }
  if (pos < size) {
    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
                                                           latin1_output);
    latin1_output += howmany;
  }
  return latin1_output - start;
}

} // namespace utf8_to_latin1
} // namespace
} // namespace lsx
} // namespace simdutf
  // namespace simdutf
/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */

  // transcoding from UTF-8 to UTF-32
/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_utf32 {

using namespace simd;

simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
                                         char32_t *utf32_output) noexcept {
  size_t pos = 0;
  char32_t *start{utf32_output};
  const size_t safety_margin = 16; // to avoid overruns!
  while (pos + 64 + safety_margin <= size) {
    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
    if (in.is_ascii()) {
      in.store_ascii_as_utf32(utf32_output);
      utf32_output += 64;
      pos += 64;
    } else {
      // -65 is 0b10111111 in two-complement's, so largest possible continuation
      // byte
      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
      size_t max_starting_point = (pos + 64) - 12;
      while (pos < max_starting_point) {
        size_t consumed = convert_masked_utf8_to_utf32(
            input + pos, utf8_end_of_code_point_mask, utf32_output);
        pos += consumed;
        utf8_end_of_code_point_mask >>= consumed;
      }
    }
  }
  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
                                                       utf32_output);
  return utf32_output - start;
}

} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8_to_utf32 {
using namespace simd;

simdutf_really_inline simd8<uint8_t>
check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
                                               // 11______ 11______
  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
                                               // 11110100 101_____
                                               // 11110101 1001____
                                               // 11110101 101_____
                                               // 1111011_ 1001____
                                               // 1111011_ 101_____
                                               // 11111___ 1001____
                                               // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
  // 11110101 1000____
  // 1111011_ 1000____
  // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
      // 0_______ ________ <ASCII in byte 1>
      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
      TOO_LONG,
      // 10______ ________ <continuation in byte 1>
      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
      // 1100____ ________ <two byte lead in byte 1>
      TOO_SHORT | OVERLONG_2,
      // 1101____ ________ <two byte lead in byte 1>
      TOO_SHORT,
      // 1110____ ________ <three byte lead in byte 1>
      TOO_SHORT | OVERLONG_3 | SURROGATE,
      // 1111____ ________ <four+ byte lead in byte 1>
      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
  constexpr const uint8_t CARRY =
      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
  const simd8<uint8_t> byte_1_low =
      (prev1 & 0x0F)
          .lookup_16<uint8_t>(
              // ____0000 ________
              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
              // ____0001 ________
              CARRY | OVERLONG_2,
              // ____001_ ________
              CARRY, CARRY,

              // ____0100 ________
              CARRY | TOO_LARGE,
              // ____0101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____011_ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,

              // ____1___ ________
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              // ____1101 ________
              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
              CARRY | TOO_LARGE | TOO_LARGE_1000,
              CARRY | TOO_LARGE | TOO_LARGE_1000);
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
      // ________ 0_______ <ASCII in byte 2>
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
      TOO_SHORT, TOO_SHORT,

      // ________ 1000____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
          OVERLONG_4,
      // ________ 1001____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
      // ________ 101_____
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,

      // ________ 11______
      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
  return (byte_1_high & byte_1_low & byte_2_high);
}
simdutf_really_inline simd8<uint8_t>
check_multibyte_lengths(const simd8<uint8_t> input,
                        const simd8<uint8_t> prev_input,
                        const simd8<uint8_t> sc) {
  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must23 =
      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
  return must23_80 ^ sc;
}

struct validating_transcoder {
  // If this is nonzero, there has been a UTF-8 error.
  simd8<uint8_t> error;

  validating_transcoder() : error(uint8_t(0)) {}
  //
  // Check whether the current bytes are valid UTF-8.
  //
  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
                                              const simd8<uint8_t> prev_input) {
    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
    // small negative numbers)
    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
    simd8<uint8_t> sc = check_special_cases(input, prev1);
    this->error |= check_multibyte_lengths(input, prev_input, sc);
  }

  simdutf_really_inline size_t convert(const char *in, size_t size,
                                       char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 16 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (utf8_continuation_mask & 1) {
          return 0; // we have an error
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      return 0;
    }
    if (pos < size) {
      size_t howmany =
          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
      if (howmany == 0) {
        return 0;
      }
      utf32_output += howmany;
    }
    return utf32_output - start;
  }

  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
                                                   char32_t *utf32_output) {
    size_t pos = 0;
    char32_t *start{utf32_output};
    // In the worst case, we have the haswell kernel which can cause an overflow
    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
    // last 16 bytes, and if the data is valid, then it is entirely safe because
    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
    // generally assume that you have valid UTF-8 input, so we are going to go
    // back from the end counting 8 leading bytes, to give us a good margin.
    size_t leading_byte = 0;
    size_t margin = size;
    for (; margin > 0 && leading_byte < 8; margin--) {
      leading_byte += (int8_t(in[margin - 1]) > -65);
    }
    // If the input is long enough, then we have that margin-1 is the fourth
    // last leading byte.
    const size_t safety_margin = size - margin + 1; // to avoid overruns!
    while (pos + 64 + safety_margin <= size) {
      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
      if (input.is_ascii()) {
        input.store_ascii_as_utf32(utf32_output);
        utf32_output += 64;
        pos += 64;
      } else {
        // you might think that a for-loop would work, but under Visual Studio,
        // it is not good enough.
        static_assert(
            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
            "We support either two or four chunks per 64-byte block.");
        auto zero = simd8<uint8_t>{uint8_t(0)};
        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
          this->check_utf8_bytes(input.chunks[0], zero);
          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
        }
        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
        if (errors() || (utf8_continuation_mask & 1)) {
          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
              pos, in + pos, size - pos, utf32_output);
          res.count += pos;
          return res;
        }
        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
        // We process in blocks of up to 12 bytes except possibly
        // for fast paths which may process up to 16 bytes. For the
        // slow path to work, we should have at least 12 input bytes left.
        size_t max_starting_point = (pos + 64) - 12;
        // Next loop is going to run at least five times.
        while (pos < max_starting_point) {
          // Performance note: our ability to compute 'consumed' and
          // then shift and recompute is critical. If there is a
          // latency of, say, 4 cycles on getting 'consumed', then
          // the inner loop might have a total latency of about 6 cycles.
          // Yet we process between 6 to 12 inputs bytes, thus we get
          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
          // for this section of the code. Hence, there is a limit
          // to how much we can further increase this latency before
          // it seriously harms performance.
          size_t consumed = convert_masked_utf8_to_utf32(
              in + pos, utf8_end_of_code_point_mask, utf32_output);
          pos += consumed;
          utf8_end_of_code_point_mask >>= consumed;
        }
        // At this point there may remain between 0 and 12 bytes in the
        // 64-byte block. These bytes will be processed again. So we have an
        // 80% efficiency (in the worst case). In practice we expect an
        // 85% to 90% efficiency.
      }
    }
    if (errors()) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      res.count += pos;
      return res;
    }
    if (pos < size) {
      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
          pos, in + pos, size - pos, utf32_output);
      if (res.error) { // In case of error, we want the error position
        res.count += pos;
        return res;
      } else { // In case of success, we want the number of word written
        utf32_output += res.count;
      }
    }
    return result(error_code::SUCCESS, utf32_output - start);
  }

  simdutf_really_inline bool errors() const {
    return this->error.any_bits_set_anywhere();
  }

}; // struct utf8_checker
} // namespace utf8_to_utf32
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */

/* begin file src/generic/utf8.h */
namespace simdutf {
namespace lsx {
namespace {
namespace utf8 {

using namespace simd;

simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.gt(-65);
    count += count_ones(utf8_continuation_mask);
  }
  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
simdutf_really_inline size_t count_code_points_bytemask(const char *in,
                                                        size_t size) {
  using vector_i8 = simd8<int8_t>;
  using vector_u8 = simd8<uint8_t>;
  using vector_u64 = simd64<uint64_t>;

  constexpr size_t N = vector_i8::SIZE;
  constexpr size_t max_iterations = 255 / 4;

  size_t pos = 0;
  size_t count = 0;

  auto counters = vector_u64::zero();
  auto local = vector_u8::zero();
  size_t iterations = 0;
  for (; pos + 4 * N <= size; pos += 4 * N) {
    const auto input0 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 0 * N));
    const auto input1 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 1 * N));
    const auto input2 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 2 * N));
    const auto input3 =
        simd8<int8_t>::load(reinterpret_cast<const int8_t *>(in + pos + 3 * N));
    const auto mask0 = input0 > int8_t(-65);
    const auto mask1 = input1 > int8_t(-65);
    const auto mask2 = input2 > int8_t(-65);
    const auto mask3 = input3 > int8_t(-65);

    local -= vector_u8(mask0);
    local -= vector_u8(mask1);
    local -= vector_u8(mask2);
    local -= vector_u8(mask3);

    iterations += 1;
    if (iterations == max_iterations) {
      counters += sum_8bytes(local);
      local = vector_u8::zero();
      iterations = 0;
    }
  }

  if (iterations > 0) {
    count += local.sum_bytes();
  }

  count += counters.sum();

  return count + scalar::utf8::count_code_points(in + pos, size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
                                                    size_t size) {
  size_t pos = 0;
  size_t count = 0;
  // This algorithm could no doubt be improved!
  for (; pos + 64 <= size; pos += 64) {
    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
    // We count one word for anything that is not a continuation (so
    // leading bytes).
    count += 64 - count_ones(utf8_continuation_mask);
    int64_t utf8_4byte = input.gteq_unsigned(240);
    count += count_ones(utf8_4byte);
  }
  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
}

} // namespace utf8
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf8.h */

/* begin file src/generic/utf32.h */
#include <limits>

namespace simdutf {
namespace lsx {
namespace {
namespace utf32 {

template <typename T> T min(T a, T b) { return a <= b ? a : b; }

simdutf_really_inline size_t utf8_length_from_utf32(const char32_t *input,
                                                    size_t length) {
  using vector_u32 = simd32<uint32_t>;

  const char32_t *start = input;

  // we add up to three ones in a single iteration (see the vectorized loop in
  // section #2 below)
  const size_t max_increment = 3;

  const size_t N = vector_u32::ELEMENTS;

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
  const auto v_0000007f = vector_u32::splat(0x0000007f);
  const auto v_000007ff = vector_u32::splat(0x000007ff);
  const auto v_0000ffff = vector_u32::splat(0x0000ffff);
#else
  const auto v_ffffff80 = vector_u32::splat(0xffffff80);
  const auto v_fffff800 = vector_u32::splat(0xfffff800);
  const auto v_ffff0000 = vector_u32::splat(0xffff0000);
  const auto one = vector_u32::splat(1);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

  size_t counter = 0;

  // 1. vectorized loop unrolled 4 times
  {
    // we use vector of uint32 counters, this is why this limit is used
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / (max_increment * 4);
    size_t blocks = length / (N * 4);
    length -= blocks * (N * 4);
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      simd32<uint32_t> acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in0 = vector_u32(input + 0 * N);
        const auto in1 = vector_u32(input + 1 * N);
        const auto in2 = vector_u32(input + 2 * N);
        const auto in3 = vector_u32(input + 3 * N);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in0 > v_0000007f);
        acc -= as_vector_u32(in1 > v_0000007f);
        acc -= as_vector_u32(in2 > v_0000007f);
        acc -= as_vector_u32(in3 > v_0000007f);

        acc -= as_vector_u32(in0 > v_000007ff);
        acc -= as_vector_u32(in1 > v_000007ff);
        acc -= as_vector_u32(in2 > v_000007ff);
        acc -= as_vector_u32(in3 > v_000007ff);

        acc -= as_vector_u32(in0 > v_0000ffff);
        acc -= as_vector_u32(in1 > v_0000ffff);
        acc -= as_vector_u32(in2 > v_0000ffff);
        acc -= as_vector_u32(in3 > v_0000ffff);
#else
        acc += min(one, in0 & v_ffffff80);
        acc += min(one, in1 & v_ffffff80);
        acc += min(one, in2 & v_ffffff80);
        acc += min(one, in3 & v_ffffff80);

        acc += min(one, in0 & v_fffff800);
        acc += min(one, in1 & v_fffff800);
        acc += min(one, in2 & v_fffff800);
        acc += min(one, in3 & v_fffff800);

        acc += min(one, in0 & v_ffff0000);
        acc += min(one, in1 & v_ffff0000);
        acc += min(one, in2 & v_ffff0000);
        acc += min(one, in3 & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += 4 * N;
      }

      counter += acc.sum();
    }
  }

  // 2. vectorized loop for tail
  {
    const size_t max_iterations =
        std::numeric_limits<uint32_t>::max() / max_increment;
    size_t blocks = length / N;
    length -= blocks * N;
    while (blocks != 0) {
      const size_t iterations = min(blocks, max_iterations);
      blocks -= iterations;

      auto acc = vector_u32::zero();
      for (size_t i = 0; i < iterations; i++) {
        const auto in = vector_u32(input);

#if SIMDUTF_SIMD_HAS_UNSIGNED_CMP
        acc -= as_vector_u32(in > v_0000007f);
        acc -= as_vector_u32(in > v_000007ff);
        acc -= as_vector_u32(in > v_0000ffff);
#else
        acc += min(one, in & v_ffffff80);
        acc += min(one, in & v_fffff800);
        acc += min(one, in & v_ffff0000);
#endif // SIMDUTF_SIMD_HAS_UNSIGNED_CMP

        input += N;
      }

      counter += acc.sum();
    }
  }

  const size_t consumed = input - start;
  if (consumed != 0) {
    // We don't count 0th bytes in the vectorized loops above, this
    // is why we need to count them in the end.
    counter += consumed;
  }

  return counter + scalar::utf32::utf8_length_from_utf32(input, length);
}

} // namespace utf32
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/utf32.h */
/* begin file src/generic/base64lengths.h */
namespace simdutf {
namespace lsx {
namespace {
namespace base64_lengths {

simdutf_warn_unused size_t binary_length_from_base64(const char *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 64 <= length; pos += 64) {
    simd8x64<uint8_t> block(reinterpret_cast<const uint8_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

simdutf_warn_unused size_t binary_length_from_base64(const char16_t *input,
                                                     size_t length) {
  size_t pos = 0;
  size_t count = 0;
  for (; pos + 32 <= length; pos += 32) {
    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(input + pos));
    uint64_t maybe_base64 = block.gteq(33); // >= 33 which is '!' in ASCII
    count += count_ones(maybe_base64);
  }
  while (pos < length) {
    count += (input[pos] > 0x20) ? 1 : 0;
    pos++;
  }
  // Count padding at the end.
  size_t padding = 0;
  pos = length;
  while (pos > 0 && padding < 2) {
    char16_t c = input[--pos];
    if (c == '=') {
      padding++;
    } else if (c > ' ') {
      break;
    }
  }
  return ((count - padding) * 3) / 4;
}

} // namespace base64_lengths
} // unnamed namespace
} // namespace lsx
} // namespace simdutf
/* end file src/generic/base64lengths.h */

//
// Implementation-specific overrides
//
namespace simdutf {
namespace lsx {

simdutf_warn_unused bool
implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return lsx::utf8_validation::generic_validate_utf8(buf, len);
}

simdutf_warn_unused result implementation::validate_utf8_with_errors(
    const char *buf, size_t len) const noexcept {
  return lsx::utf8_validation::generic_validate_utf8_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_ascii(const char *buf, size_t len) const noexcept {
  return lsx::ascii_validation::generic_validate_ascii(buf, len);
}

simdutf_warn_unused result implementation::validate_ascii_with_errors(
    const char *buf, size_t len) const noexcept {
  return lsx::ascii_validation::generic_validate_ascii_with_errors(buf, len);
}

simdutf_warn_unused bool
implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    // empty input is valid. protected the implementation from nullptr.
    return true;
  }
  const char32_t *tail = lsx_validate_utf32le(buf, len);
  if (tail) {
    return scalar::utf32::validate(tail, len - (tail - buf));
  } else {
    return false;
  }
}

simdutf_warn_unused result implementation::validate_utf32_with_errors(
    const char32_t *buf, size_t len) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  result res = lsx_validate_utf32le_with_errors(buf, len);
  if (res.count != len) {
    result scalar_res =
        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
    return result(scalar_res.error, res.count + scalar_res.count);
  } else {
    return res;
  }
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
    const char *buf, size_t len, char *utf8_output) const noexcept {
  std::pair<const char *, char *> ret =
      lsx_convert_latin1_to_utf8(buf, len, utf8_output);
  size_t converted_chars = ret.second - utf8_output;

  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  std::pair<const char *, char32_t *> ret =
      lsx_convert_latin1_to_utf32(buf, len, utf32_output);
  size_t converted_chars = ret.second - utf32_output;
  if (ret.first != buf + len) {
    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
        ret.first, len - (ret.first - buf), ret.second);
    converted_chars += scalar_converted_chars;
  }
  return converted_chars;
}

simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert(buf, len, latin1_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  utf8_to_latin1::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
    const char *buf, size_t len, char *latin1_output) const noexcept {
  return lsx::utf8_to_latin1::convert_valid(buf, len, latin1_output);
}

simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert(buf, len, utf32_output);
}

simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
  utf8_to_utf32::validating_transcoder converter;
  return converter.convert_with_errors(buf, len, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
    const char *input, size_t size, char32_t *utf32_output) const noexcept {
  return utf8_to_utf32::convert_valid(input, size, utf32_output);
}

simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return 0;
  }
  std::pair<const char32_t *, char *> ret =
      lsx_convert_utf32_to_utf8(buf, len, utf8_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - utf8_output;
  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  if (simdutf_unlikely(len == 0)) {
    return result(error_code::SUCCESS, 0);
  }
  // ret.first.count is always the position in the buffer, not the number of
  // code units written even if finished
  std::pair<result, char *> ret =
      lsx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
  if (ret.first.count != len) {
    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      utf8_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
        ret.first, len - (ret.first - buf), ret.second);
    if (scalar_saved_bytes == 0) {
      return 0;
    }
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<result, char *> ret =
      lsx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
  if (ret.first.error) {
    return ret.first;
  } // Can return directly since scalar fallback already found correct
    // ret.first.count
  if (ret.first.count != len) { // All good so far, but not finished
    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
        buf + ret.first.count, len - ret.first.count, ret.second);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;
      return scalar_res;
    } else {
      ret.second += scalar_res.count;
    }
  }
  ret.first.count =
      ret.second -
      latin1_output; // Set count to the number of 8-bit code units written
  return ret.first;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
  std::pair<const char32_t *, char *> ret =
      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
  if (ret.first == nullptr) {
    return 0;
  }
  size_t saved_bytes = ret.second - latin1_output;

  if (ret.first != buf + len) {
    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
        ret.first, len - (ret.first - buf), ret.second);
    saved_bytes += scalar_saved_bytes;
  }
  return saved_bytes;
}

simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
  // optimization opportunity: implement a custom function.
  return convert_utf32_to_utf8(buf, len, utf8_output);
}

simdutf_warn_unused size_t
implementation::count_utf8(const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
    const char *buf, size_t len) const noexcept {
  return count_utf8(buf, len);
}

simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
    const char *input, size_t length) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
  const uint8_t *data_end = data + length;
  uint64_t result = 0;
  while (data_end - data > 16) {
    uint64_t two_bytes = 0;
    __m128i input_vec = __lsx_vld(data, 0);
    two_bytes =
        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
    result += 16 + two_bytes;
    data += 16;
  }
  return result + scalar::latin1::utf8_length_from_latin1((const char *)data,
                                                          data_end - data);
}

simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
    const char32_t *input, size_t length) const noexcept {
  return utf32::utf8_length_from_utf32(input, length);
}

simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
    const char *input, size_t length) const noexcept {
  return utf8::count_code_points(input, length);
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused result implementation::base64_to_binary(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

simdutf_warn_unused full_result implementation::base64_to_binary_details(
    const char16_t *input, size_t length, char *output, base64_options options,
    last_chunk_handling_options last_chunk_options) const noexcept {
  if (options & base64_default_or_url) {
    if (options == base64_options::base64_default_or_url_accept_garbage) {
      return compress_decode_base64<false, true, true>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, true>(
          output, input, length, options, last_chunk_options);
    }
  } else if (options & base64_url) {
    if (options == base64_options::base64_url_accept_garbage) {
      return compress_decode_base64<true, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<true, false, false>(
          output, input, length, options, last_chunk_options);
    }
  } else {
    if (options == base64_options::base64_default_accept_garbage) {
      return compress_decode_base64<false, true, false>(
          output, input, length, options, last_chunk_options);
    } else {
      return compress_decode_base64<false, false, false>(
          output, input, length, options, last_chunk_options);
    }
  }
}

size_t implementation::binary_to_base64(const char *input, size_t length,
                                        char *output,
                                        base64_options options) const noexcept {
  if (options & base64_url) {
    return encode_base64<true>(output, input, length, options);
  } else {
    return encode_base64<false>(output, input, length, options);
  }
}

size_t implementation::binary_to_base64_with_lines(
    const char *input, size_t length, char *output, size_t line_length,
    base64_options options) const noexcept {
  return scalar::base64::tail_encode_base64_impl<true>(output, input, length,
                                                       options, line_length);
}

const char *implementation::find(const char *start, const char *end,
                                 char character) const noexcept {
  return util_find(start, end, character);
}

const char16_t *implementation::find(const char16_t *start, const char16_t *end,
                                     char16_t character) const noexcept {
  return util_find(start, end, character);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

simdutf_warn_unused size_t implementation::binary_length_from_base64(
    const char16_t *input, size_t length) const noexcept {
  return base64_lengths::binary_length_from_base64(input, length);
}

} // namespace lsx
} // namespace simdutf

/* begin file src/simdutf/lsx/end.h */
#undef SIMDUTF_SIMD_HAS_UNSIGNED_CMP
/* end file src/simdutf/lsx/end.h */
/* end file src/lsx/implementation.cpp */
#endif

/* begin file src/simdutf_c.cpp */
/* begin file include/simdutf_c.h */
/***
 * simdutf_c.h.h - C API for simdutf
 * This is currently experimental.
 * We are committed to keeping the C API, but there might be mistakes in our
 * implementation. Please report any issues you find.
 */

#ifndef SIMDUTF_C_H
#define SIMDUTF_C_H

#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>

#ifdef __has_include
  #if __has_include(<uchar.h>)
    #include <uchar.h>
  #else // __has_include(<uchar.h>)
    #define char16_t uint16_t
    #define char32_t uint32_t
  #endif // __has_include(<uchar.h>)
#else    // __has_include(<uchar.h>)
  #define char16_t uint16_t
  #define char32_t uint32_t
#endif // __has_include

#ifdef __cplusplus
extern "C" {
#endif

/* C-friendly subset of simdutf errors */
typedef enum simdutf_error_code {
  SIMDUTF_ERROR_SUCCESS = 0,
  SIMDUTF_ERROR_HEADER_BITS,
  SIMDUTF_ERROR_TOO_SHORT,
  SIMDUTF_ERROR_TOO_LONG,
  SIMDUTF_ERROR_OVERLONG,
  SIMDUTF_ERROR_TOO_LARGE,
  SIMDUTF_ERROR_SURROGATE,
  SIMDUTF_ERROR_INVALID_BASE64_CHARACTER,
  SIMDUTF_ERROR_BASE64_INPUT_REMAINDER,
  SIMDUTF_ERROR_BASE64_EXTRA_BITS,
  SIMDUTF_ERROR_OUTPUT_BUFFER_TOO_SMALL,
  SIMDUTF_ERROR_OTHER
} simdutf_error_code;

typedef struct simdutf_result {
  simdutf_error_code error;
  size_t count; /* position of error or number of code units validated */
} simdutf_result;

typedef struct simdutf_full_result {
  simdutf_error_code error;
  size_t input_count;  /* number of input units consumed */
  size_t output_count; /* number of output bytes written */
} simdutf_full_result;

typedef enum simdutf_encoding_type {
  SIMDUTF_ENCODING_UNSPECIFIED = 0,
  SIMDUTF_ENCODING_UTF8 = 1,
  SIMDUTF_ENCODING_UTF16_LE = 2,
  SIMDUTF_ENCODING_UTF16_BE = 4,
  SIMDUTF_ENCODING_UTF32_LE = 8,
  SIMDUTF_ENCODING_UTF32_BE = 16
} simdutf_encoding_type;

/* Validate UTF-8: returns true iff input is valid UTF-8 */
bool simdutf_validate_utf8(const char *buf, size_t len);

/* Validate UTF-8 with detailed result */
simdutf_result simdutf_validate_utf8_with_errors(const char *buf, size_t len);

/* Encoding detection */
simdutf_encoding_type simdutf_autodetect_encoding(const char *input,
                                                  size_t length);
int simdutf_detect_encodings(const char *input, size_t length);

/* ASCII validation */
bool simdutf_validate_ascii(const char *buf, size_t len);
simdutf_result simdutf_validate_ascii_with_errors(const char *buf, size_t len);

/* UTF-16 ASCII checks */
bool simdutf_validate_utf16_as_ascii(const char16_t *buf, size_t len);
bool simdutf_validate_utf16be_as_ascii(const char16_t *buf, size_t len);
bool simdutf_validate_utf16le_as_ascii(const char16_t *buf, size_t len);

/* UTF-16/UTF-8/UTF-32 validation (native/endian-specific) */
bool simdutf_validate_utf16(const char16_t *buf, size_t len);
bool simdutf_validate_utf16le(const char16_t *buf, size_t len);
bool simdutf_validate_utf16be(const char16_t *buf, size_t len);
simdutf_result simdutf_validate_utf16_with_errors(const char16_t *buf,
                                                  size_t len);
simdutf_result simdutf_validate_utf16le_with_errors(const char16_t *buf,
                                                    size_t len);
simdutf_result simdutf_validate_utf16be_with_errors(const char16_t *buf,
                                                    size_t len);

bool simdutf_validate_utf32(const char32_t *buf, size_t len);
simdutf_result simdutf_validate_utf32_with_errors(const char32_t *buf,
                                                  size_t len);

/* to_well_formed UTF-16 helpers */
void simdutf_to_well_formed_utf16le(const char16_t *input, size_t len,
                                    char16_t *output);
void simdutf_to_well_formed_utf16be(const char16_t *input, size_t len,
                                    char16_t *output);
void simdutf_to_well_formed_utf16(const char16_t *input, size_t len,
                                  char16_t *output);

/* Counting */
size_t simdutf_count_utf16(const char16_t *input, size_t length);
size_t simdutf_count_utf16le(const char16_t *input, size_t length);
size_t simdutf_count_utf16be(const char16_t *input, size_t length);
size_t simdutf_count_utf8(const char *input, size_t length);

/* Length estimators */
size_t simdutf_utf8_length_from_latin1(const char *input, size_t length);
size_t simdutf_latin1_length_from_utf8(const char *input, size_t length);
size_t simdutf_latin1_length_from_utf16(size_t length);
size_t simdutf_latin1_length_from_utf32(size_t length);
size_t simdutf_utf16_length_from_utf8(const char *input, size_t length);
size_t simdutf_utf32_length_from_utf8(const char *input, size_t length);
size_t simdutf_utf8_length_from_utf16(const char16_t *input, size_t length);
size_t simdutf_utf8_length_from_utf32(const char32_t *input, size_t length);
simdutf_result
simdutf_utf8_length_from_utf16_with_replacement(const char16_t *input,
                                                size_t length);
size_t simdutf_utf8_length_from_utf16le(const char16_t *input, size_t length);
size_t simdutf_utf8_length_from_utf16be(const char16_t *input, size_t length);
simdutf_result
simdutf_utf8_length_from_utf16le_with_replacement(const char16_t *input,
                                                  size_t length);
simdutf_result
simdutf_utf8_length_from_utf16be_with_replacement(const char16_t *input,
                                                  size_t length);

/* Conversions: latin1 <-> utf8, utf8 <-> utf16/utf32, utf16 <-> utf8, etc. */
size_t simdutf_convert_latin1_to_utf8(const char *input, size_t length,
                                      char *output);
size_t simdutf_convert_latin1_to_utf8_safe(const char *input, size_t length,
                                           char *output, size_t utf8_len);
size_t simdutf_convert_latin1_to_utf16le(const char *input, size_t length,
                                         char16_t *output);
size_t simdutf_convert_latin1_to_utf16be(const char *input, size_t length,
                                         char16_t *output);
size_t simdutf_convert_latin1_to_utf16(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_latin1_to_utf32(const char *input, size_t length,
                                       char32_t *output);

size_t simdutf_convert_utf8_to_latin1(const char *input, size_t length,
                                      char *output);
size_t simdutf_convert_utf8_to_utf16le(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_utf8_to_utf16be(const char *input, size_t length,
                                       char16_t *output);
size_t simdutf_convert_utf8_to_utf16(const char *input, size_t length,
                                     char16_t *output);

size_t simdutf_convert_utf8_to_utf32(const char *input, size_t length,
                                     char32_t *output);
simdutf_result simdutf_convert_utf8_to_latin1_with_errors(const char *input,
                                                          size_t length,
                                                          char *output);
simdutf_result simdutf_convert_utf8_to_utf16_with_errors(const char *input,
                                                         size_t length,
                                                         char16_t *output);
simdutf_result simdutf_convert_utf8_to_utf16le_with_errors(const char *input,
                                                           size_t length,
                                                           char16_t *output);
simdutf_result simdutf_convert_utf8_to_utf16be_with_errors(const char *input,
                                                           size_t length,
                                                           char16_t *output);
simdutf_result simdutf_convert_utf8_to_utf32_with_errors(const char *input,
                                                         size_t length,
                                                         char32_t *output);

/* Conversions assuming valid input */
size_t simdutf_convert_valid_utf8_to_latin1(const char *input, size_t length,
                                            char *output);
size_t simdutf_convert_valid_utf8_to_utf16le(const char *input, size_t length,
                                             char16_t *output);
size_t simdutf_convert_valid_utf8_to_utf16be(const char *input, size_t length,
                                             char16_t *output);
size_t simdutf_convert_valid_utf8_to_utf32(const char *input, size_t length,
                                           char32_t *output);

/* UTF-16 -> UTF-8 and related conversions */
size_t simdutf_convert_utf16_to_utf8(const char16_t *input, size_t length,
                                     char *output);
size_t simdutf_convert_utf16le_to_utf8(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16be_to_utf8(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16_to_utf8_safe(const char16_t *input, size_t length,
                                          char *output, size_t utf8_len);
size_t simdutf_convert_utf16_to_latin1(const char16_t *input, size_t length,
                                       char *output);
size_t simdutf_convert_utf16le_to_latin1(const char16_t *input, size_t length,
                                         char *output);
size_t simdutf_convert_utf16be_to_latin1(const char16_t *input, size_t length,
                                         char *output);
simdutf_result
simdutf_convert_utf16_to_latin1_with_errors(const char16_t *input,
                                            size_t length, char *output);
simdutf_result
simdutf_convert_utf16le_to_latin1_with_errors(const char16_t *input,
                                              size_t length, char *output);
simdutf_result
simdutf_convert_utf16be_to_latin1_with_errors(const char16_t *input,
                                              size_t length, char *output);

simdutf_result simdutf_convert_utf16_to_utf8_with_errors(const char16_t *input,
                                                         size_t length,
                                                         char *output);
simdutf_result
simdutf_convert_utf16le_to_utf8_with_errors(const char16_t *input,
                                            size_t length, char *output);
simdutf_result
simdutf_convert_utf16be_to_utf8_with_errors(const char16_t *input,
                                            size_t length, char *output);

size_t simdutf_convert_valid_utf16_to_utf8(const char16_t *input, size_t length,
                                           char *output);
size_t simdutf_convert_valid_utf16_to_latin1(const char16_t *input,
                                             size_t length, char *output);
size_t simdutf_convert_valid_utf16le_to_latin1(const char16_t *input,
                                               size_t length, char *output);
size_t simdutf_convert_valid_utf16be_to_latin1(const char16_t *input,
                                               size_t length, char *output);

size_t simdutf_convert_valid_utf16le_to_utf8(const char16_t *input,
                                             size_t length, char *output);
size_t simdutf_convert_valid_utf16be_to_utf8(const char16_t *input,
                                             size_t length, char *output);

/* UTF-16 <-> UTF-32 conversions */
size_t simdutf_convert_utf16_to_utf32(const char16_t *input, size_t length,
                                      char32_t *output);
size_t simdutf_convert_utf16le_to_utf32(const char16_t *input, size_t length,
                                        char32_t *output);
size_t simdutf_convert_utf16be_to_utf32(const char16_t *input, size_t length,
                                        char32_t *output);
simdutf_result simdutf_convert_utf16_to_utf32_with_errors(const char16_t *input,
                                                          size_t length,
                                                          char32_t *output);
simdutf_result
simdutf_convert_utf16le_to_utf32_with_errors(const char16_t *input,
                                             size_t length, char32_t *output);
simdutf_result
simdutf_convert_utf16be_to_utf32_with_errors(const char16_t *input,
                                             size_t length, char32_t *output);

/* Valid UTF-16 conversions */
size_t simdutf_convert_valid_utf16_to_utf32(const char16_t *input,
                                            size_t length, char32_t *output);
size_t simdutf_convert_valid_utf16le_to_utf32(const char16_t *input,
                                              size_t length, char32_t *output);
size_t simdutf_convert_valid_utf16be_to_utf32(const char16_t *input,
                                              size_t length, char32_t *output);

/* UTF-32 -> ... conversions */
size_t simdutf_convert_utf32_to_utf8(const char32_t *input, size_t length,
                                     char *output);
simdutf_result simdutf_convert_utf32_to_utf8_with_errors(const char32_t *input,
                                                         size_t length,
                                                         char *output);
size_t simdutf_convert_valid_utf32_to_utf8(const char32_t *input, size_t length,
                                           char *output);

size_t simdutf_convert_utf32_to_utf16(const char32_t *input, size_t length,
                                      char16_t *output);
size_t simdutf_convert_utf32_to_utf16le(const char32_t *input, size_t length,
                                        char16_t *output);
size_t simdutf_convert_utf32_to_utf16be(const char32_t *input, size_t length,
                                        char16_t *output);
simdutf_result
simdutf_convert_utf32_to_latin1_with_errors(const char32_t *input,
                                            size_t length, char *output);

/* --- Find helpers --- */
const char *simdutf_find(const char *start, const char *end, char character);
const char16_t *simdutf_find_utf16(const char16_t *start, const char16_t *end,
                                   char16_t character);

/* --- Base64 enums and helpers --- */
typedef enum simdutf_base64_options {
  SIMDUTF_BASE64_DEFAULT = 0,
  SIMDUTF_BASE64_URL = 1,
  SIMDUTF_BASE64_DEFAULT_NO_PADDING = 2,
  SIMDUTF_BASE64_URL_WITH_PADDING = 3,
  SIMDUTF_BASE64_DEFAULT_ACCEPT_GARBAGE = 4,
  SIMDUTF_BASE64_URL_ACCEPT_GARBAGE = 5,
  SIMDUTF_BASE64_DEFAULT_OR_URL = 8,
  SIMDUTF_BASE64_DEFAULT_OR_URL_ACCEPT_GARBAGE = 12
} simdutf_base64_options;

typedef enum simdutf_last_chunk_handling_options {
  SIMDUTF_LAST_CHUNK_LOOSE = 0,
  SIMDUTF_LAST_CHUNK_STRICT = 1,
  SIMDUTF_LAST_CHUNK_STOP_BEFORE_PARTIAL = 2,
  SIMDUTF_LAST_CHUNK_ONLY_FULL_CHUNKS = 3
} simdutf_last_chunk_handling_options;

/* maximal binary length estimators */
size_t simdutf_maximal_binary_length_from_base64(const char *input,
                                                 size_t length);
size_t simdutf_maximal_binary_length_from_base64_utf16(const char16_t *input,
                                                       size_t length);

/* base64 decoding/encoding */
simdutf_result simdutf_base64_to_binary(
    const char *input, size_t length, char *output,
    simdutf_base64_options options,
    simdutf_last_chunk_handling_options last_chunk_options);
simdutf_result simdutf_base64_to_binary_utf16(
    const char16_t *input, size_t length, char *output,
    simdutf_base64_options options,
    simdutf_last_chunk_handling_options last_chunk_options);

size_t simdutf_base64_length_from_binary(size_t length,
                                         simdutf_base64_options options);
size_t simdutf_base64_length_from_binary_with_lines(
    size_t length, simdutf_base64_options options, size_t line_length);

size_t simdutf_binary_to_base64(const char *input, size_t length, char *output,
                                simdutf_base64_options options);
size_t simdutf_binary_to_base64_with_lines(const char *input, size_t length,
                                           char *output, size_t line_length,
                                           simdutf_base64_options options);

/* safe decoding that provides an in/out outlen parameter */
simdutf_result simdutf_base64_to_binary_safe(
    const char *input, size_t length, char *output, size_t *outlen,
    simdutf_base64_options options,
    simdutf_last_chunk_handling_options last_chunk_options,
    bool decode_up_to_bad_char);
simdutf_result simdutf_base64_to_binary_safe_utf16(
    const char16_t *input, size_t length, char *output, size_t *outlen,
    simdutf_base64_options options,
    simdutf_last_chunk_handling_options last_chunk_options,
    bool decode_up_to_bad_char);

/* detailed decoding returning input_count and output_count */
simdutf_full_result simdutf_base64_to_binary_details(
    const char *input, size_t length, char *output,
    simdutf_base64_options options,
    simdutf_last_chunk_handling_options last_chunk_options);
simdutf_full_result simdutf_base64_to_binary_details_utf16(
    const char16_t *input, size_t length, char *output,
    simdutf_base64_options options,
    simdutf_last_chunk_handling_options last_chunk_options);

/* single-character base64 validation */
bool simdutf_base64_valid(char input, simdutf_base64_options options);
bool simdutf_base64_valid_utf16(char16_t input, simdutf_base64_options options);

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* SIMDUTF_C_H */
/* end file include/simdutf_c.h */

static simdutf_result to_c_result(const simdutf::result &r) {
  simdutf_result out;
  out.error = static_cast<simdutf_error_code>(r.error);
  out.count = r.count;
  return out;
}

/* The C wrapper depends on the library features. Only expose the C API
   when all relevant feature is enabled. This helps the
   single-header generator to omit the C wrapper when features are
   disabled. */
// clang-format off
// clang-format on
/* end file src/simdutf_c.cpp */
SIMDUTF_POP_DISABLE_WARNINGS
/* end file src/simdutf.cpp */