namespace simdutf {
namespace SIMDUTF_IMPLEMENTATION {
namespace {
namespace utf16 {

// Counts the code points encoded by `size` UTF-16 code units starting at `in`.
//
// Whole blocks of 32 code units are processed with SIMD: a block is
// byte-swapped when match_system(big_endian) is false, and then every unit
// lying outside 0xDC00..0xDFFF contributes one to the total. Whatever is left
// after the last full block is handed to the scalar implementation.
//
// NOTE(review): the population count is halved, which suggests the mask
// returned by not_in_range() carries two bits per 16-bit unit -- confirm
// against the simd16x32 definition.
template <endianness big_endian>
simdutf_really_inline size_t count_code_points(const char16_t *in,
                                               size_t size) {
  constexpr size_t block_len = 32;
  // Number of code units covered by complete SIMD blocks.
  const size_t simd_end = size / block_len * block_len;

  size_t code_points = 0;
  size_t offset = 0;
  while (offset < simd_end) {
    simd16x32<uint16_t> block(
        reinterpret_cast<const uint16_t *>(in + offset));
    if (!match_system(big_endian)) {
      block.swap_bytes();
    }
    // Units in 0xDC00..0xDFFF are excluded so that a surrogate pair is
    // counted only once.
    const uint64_t counted_units = block.not_in_range(0xDC00, 0xDFFF);
    code_points += count_ones(counted_units) / 2;
    offset += block_len;
  }

  // Tail shorter than one block.
  const size_t tail_code_points =
      scalar::utf16::count_code_points<big_endian>(in + offset, size - offset);
  return code_points + tail_code_points;
}

// Computes how many UTF-8 bytes are needed to encode `size` UTF-16 code units
// starting at `in`.
//
// Whole blocks of 32 code units are classified with SIMD masks:
//   * units <= 0x7F                          -> 1 byte each
//   * units in 0x80..0x7FF                   -> 2 bytes each
//   * other units outside 0xD800..0xDFFF     -> 3 bytes each
//   * units inside 0xD800..0xDFFF            -> 2 bytes each (so two such
//                                               units together give 4 bytes)
// The remaining tail is measured by the scalar implementation.
//
// NOTE(review): every population count is halved, which suggests the masks
// carry two bits per 16-bit unit -- confirm against simd16x32.
template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
                                                    size_t size) {
  constexpr size_t block_len = 32;
  // Number of code units covered by complete SIMD blocks.
  const size_t simd_end = size / block_len * block_len;

  size_t utf8_bytes = 0;
  size_t offset = 0;
  // This algorithm could no doubt be improved!
  while (offset < simd_end) {
    simd16x32<uint16_t> block(
        reinterpret_cast<const uint16_t *>(in + offset));
    if (!match_system(big_endian)) {
      block.swap_bytes();
    }

    const uint64_t upto_7f = block.lteq(0x7F);
    const uint64_t upto_7ff = block.lteq(0x7FF);
    const uint64_t non_surrogate = block.not_in_range(0xD800, 0xDFFF);

    const size_t one_byte_units = count_ones(upto_7f) / 2;
    const size_t two_byte_units = count_ones(upto_7ff & ~upto_7f) / 2;
    const size_t three_byte_units = count_ones(non_surrogate & ~upto_7ff) / 2;
    // Everything that is not a non-surrogate is a surrogate code unit.
    const size_t surrogate_units = 32 - count_ones(non_surrogate) / 2;

    utf8_bytes += one_byte_units + 2 * two_byte_units + 3 * three_byte_units +
                  2 * surrogate_units;
    offset += block_len;
  }

  // Tail shorter than one block.
  const size_t tail_bytes = scalar::utf16::utf8_length_from_utf16<big_endian>(
      in + offset, size - offset);
  return utf8_bytes + tail_bytes;
}

#ifdef SIMDUTF_SIMD_HAS_BYTEMASK
// Computes how many UTF-8 bytes are needed to encode `size` UTF-16 code units
// starting at `in`, accumulating per-lane counters in a simd16<uint16_t>
// vector instead of extracting bitmasks.
//
// Whole vectors of N = simd16<uint16_t>::ELEMENTS code units are processed in
// the loop; the remaining tail is measured by the scalar implementation.
template <endianness big_endian>
simdutf_really_inline size_t utf8_length_from_utf16_bytemask(const char16_t *in,
                                                             size_t size) {
  size_t pos = 0;

  using vector_u16 = simd16<uint16_t>;
  constexpr size_t N = vector_u16::ELEMENTS;

  // Number of code units covered by complete vectors.
  const size_t simd_end = size / N * N;

  const auto one = vector_u16::splat(1);

  auto v_count = vector_u16::zero();

  // each char16 yields at least one byte
  size_t count = simd_end;

  // In a single iteration each 16-bit lane grows by 0, 1 or 2 (despite the
  // three additions below), so the lanes cannot overflow as long as they are
  // flushed at least every 65535 / 2 iterations.
  constexpr size_t max_iterations = 65535 / 2;
  size_t iteration = max_iterations;

  for (; pos < simd_end; pos += N) {
    auto input = vector_u16::load(reinterpret_cast<const uint16_t *>(in + pos));
    if (!match_system(big_endian)) {
      input = input.swap_bytes();
    }

    // 0xd800 .. 0xdbff - high (leading) surrogate
    // 0xdc00 .. 0xdfff - low (trailing) surrogate
    const auto is_surrogate = ((input & uint16_t(0xf800)) == uint16_t(0xd800));

    // c0 - chars that yield 2- or 3-byte UTF-8 codes
    const auto c0 = min(input & uint16_t(0xff80), one);

    // c1 - chars that yield 3-byte UTF-8 codes (including surrogates)
    const auto c1 = min(input & uint16_t(0xf800), one);

    /*
        Explanation of how the counting works.

        In the case of a non-surrogate character we count:
        * always 1 -- see how `count` is initialized above;
        * c0 = 1 if the current char yields 2 or 3 bytes;
        * c1 = 1 if the current char yields 3 bytes.

        Thus, we always have the correct count for the current char:
        1, 2 or 3 bytes.

        A trickier part is how we count surrogate pairs. Whenever
        we encounter a surrogate (high or low), we count it as
        3 bytes and then subtract 1 (`is_surrogate` is -1 or 0).
        Each surrogate code unit therefore yields 2. A surrogate
        pair, that is a high surrogate followed by a low one,
        yields the expected 4 bytes.

        This also works when the high surrogate is processed by
        this loop but the matching low surrogate falls into the
        tail handled by the scalar procedure, provided the scalar
        procedure (defined elsewhere) uses the same per-unit
        accounting; for valid UTF-16 strings the total is then
        correct.
    */
    v_count += c0;
    v_count += c1;
    v_count += vector_u16(is_surrogate);

    iteration -= 1;
    if (iteration == 0) {
      // Flush the lane counters before they can overflow.
      count += v_count.sum();
      v_count = vector_u16::zero();

      iteration = max_iterations;
    }
  }

  // Flush what has accumulated since the last in-loop flush. No guard is
  // needed: `iteration` is reset whenever it reaches zero, so it is always
  // positive here, and the counters are all zero right after a flush.
  count += v_count.sum();

  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
                                                                   size - pos);
}
#endif // SIMDUTF_SIMD_HAS_BYTEMASK

// Returns the UTF-32 length for `size` UTF-16 code units starting at `in`.
// The result is obtained by counting code points with count_code_points(),
// using the same endianness template argument.
template <endianness big_endian>
simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
                                                     size_t size) {
  const size_t code_points = count_code_points<big_endian>(in, size);
  return code_points;
}

// Copies `size` UTF-16 code units from `in` to `output`, swapping the two
// bytes of every code unit.
//
// Whole blocks of 32 code units are loaded, byte-swapped and stored with
// SIMD; the remaining tail is converted by the scalar implementation, which
// continues writing right after the last SIMD block.
simdutf_really_inline void
change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
  constexpr size_t block_len = 32;
  // Number of code units covered by complete SIMD blocks.
  const size_t simd_end = size / block_len * block_len;

  size_t offset = 0;
  for (; offset < simd_end; offset += block_len) {
    simd16x32<uint16_t> block(
        reinterpret_cast<const uint16_t *>(in + offset));
    block.swap_bytes();
    block.store(reinterpret_cast<uint16_t *>(output + offset));
  }

  // Tail shorter than one block.
  scalar::utf16::change_endianness_utf16(in + offset, size - offset,
                                         output + offset);
}

} // namespace utf16
} // unnamed namespace
} // namespace SIMDUTF_IMPLEMENTATION
} // namespace simdutf
