yalantinglibs/include/ylt/standalone/cinatra/picohttpparser.h

/*
 * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase,
 *                         Shigeo Mitsunari
 *
 * The software is licensed under either the MIT License (below) or the Perl
 * license.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef picohttpparser_h
#define picohttpparser_h
#include <assert.h>
#include <stddef.h>
#include <string.h>

#include <string_view>

#ifdef CINATRA_SSE
#ifdef _MSC_VER
#include <nmmintrin.h>
#else
#include <x86intrin.h>
#endif
#endif

#ifdef CINATRA_AVX2
#include <immintrin.h>
#endif

#ifdef CINATRA_ARM_OPT
#include <arm_neon.h>
#endif

#ifdef _MSC_VER
#define ssize_t intptr_t
#else
#include <sys/types.h>
#endif

namespace cinatra {
/* One parsed header line. The parse_headers() functions in this file fill
 * both views from (pointer, length) pairs that point into the caller's
 * request buffer; nothing is copied. */
struct http_header {
  /* Field name; built from (NULL, 0) for a continuation line. */
  std::string_view name;
  /* Field value, without the terminating CRLF / LF. */
  std::string_view value;
};
namespace detail {

/* http_header contains the name and value of a header (name == NULL if the
 * line is a continuation line of a multiline header) */

/* returns number of bytes consumed if successful, -2 if request is partial,
 * -1 if failed */
// int phr_parse_request(const char *buf, size_t len, const char **method,
// size_t *method_len, const char **path, size_t *path_len,
//                      int *minor_version, struct phr_header *headers, size_t
//                      *num_headers, size_t last_len);

/* ditto */
// int phr_parse_response(const char *_buf, size_t len, int *minor_version, int
// *status, const char **msg, size_t *msg_len,
//                       struct phr_header *headers, size_t *num_headers, size_t
//                       last_len);

/* ditto */
// int phr_parse_headers(const char *buf, size_t len, struct phr_header
// *headers, size_t *num_headers, size_t last_len);

/* should be zero-filled before start */
struct phr_chunked_decoder {
  size_t bytes_left_in_chunk; /* number of bytes left in current chunk */
  char consume_trailer;       /* if trailing headers should be consumed */
  /* NOTE(review): the two fields below are only touched by the decode
   * routine, which is not visible in this part of the file; presumably a
   * count of hex digits read for the chunk size and the decoder's current
   * state -- confirm against that routine before relying on them. */
  char _hex_count;
  char _state;
};

/* the function rewrites the buffer given as (buf, bufsz) removing the chunked-
 * encoding headers.  When the function returns without an error, bufsz is
 * updated to the length of the decoded data available.  Applications should
 * repeatedly call the function while it returns -2 (incomplete) every time
 * supplying newly arrived data.  If the end of the chunked-encoded data is
 * found, the function returns a non-negative number indicating the number of
 * octets left undecoded at the tail of the supplied buffer.  Returns -1 on
 * error.
 */
// ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf,
// size_t *bufsz);

/* returns if the chunked decoder is in middle of chunked data */
// int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder);

/* $Id: a707070d11d499609f99d09f97535642cec910a8 $ */

/* Branch-prediction hints: expand to __builtin_expect when __GNUC__ >= 3 and
 * to the bare expression otherwise. */
#if __GNUC__ >= 3
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif

/* ALIGNED(n): compiler-specific spelling of an n-byte alignment request. */
#ifdef _MSC_VER
#define ALIGNED(n) _declspec(align(n))
#else
#define ALIGNED(n) __attribute__((aligned(n)))
#endif

/* True for bytes 0x20..0x7e: (c - 32) is compared as unsigned against 95, so
 * bytes below 0x20 wrap around and fail the test together with 0x7f..0xff. */
#define IS_PRINTABLE_ASCII(c) ((unsigned char)(c)-040u < 0137u)

/* The three macros below expect `buf`, `buf_end` and `ret` to be in scope and
 * the enclosing function to return a pointer. */

/* Out of input: report -2 through *ret and return NULL. */
#define CHECK_EOF()     \
  if (buf == buf_end) { \
    *ret = -2;          \
    return NULL;        \
  }

/* Consume one byte without a bounds check; report -1 through *ret and return
 * NULL if it is not `ch`. */
#define EXPECT_CHAR_NO_CHECK(ch) \
  if (*buf++ != ch) {            \
    *ret = -1;                   \
    return NULL;                 \
  }

/* Bounds-checked variant: CHECK_EOF() followed by EXPECT_CHAR_NO_CHECK(ch).
 * Expands to two statements, so it must not be used as the unbraced body of
 * an if/else. */
#define EXPECT_CHAR(ch) \
  CHECK_EOF();          \
  EXPECT_CHAR_NO_CHECK(ch);

/* ADVANCE_TOKEN(tok, toklen): scan a space-terminated token starting at `buf`.
 *
 * Expects `buf`, `buf_end` and `ret` in the enclosing scope. On success `buf`
 * is left on the terminating ' ', and (tok, toklen) describe the bytes before
 * it. A control byte (< 0x20) or DEL inside the token makes the enclosing
 * function return NULL with *ret = -1; running out of input returns NULL with
 * *ret = -2 (via CHECK_EOF).
 *
 * The bulk scan is delegated to findchar_nonprintable_fast(), which picks the
 * NEON or the findchar_fast() implementation itself, so a single definition
 * serves every build flavour. */
#define ADVANCE_TOKEN(tok, toklen)                                   \
  do {                                                               \
    const char *tok_start = buf;                                     \
    int found2;                                                      \
    buf = findchar_nonprintable_fast(buf, buf_end, &found2);         \
    if (!found2) {                                                   \
      CHECK_EOF();                                                   \
    }                                                                \
    while (*buf != ' ') {                                            \
      if (unlikely(!IS_PRINTABLE_ASCII(*buf)) &&                     \
          ((unsigned char)*buf < '\040' || *buf == '\177')) {        \
        *ret = -1;                                                   \
        return NULL;                                                 \
      }                                                              \
      ++buf;                                                         \
      CHECK_EOF();                                                   \
    }                                                                \
    tok = tok_start;                                                 \
    toklen = buf - tok_start;                                        \
  } while (0)

/* 256-entry lookup table indexed by (unsigned char): 1 marks a byte that the
 * header-name scanners in this file accept, 0 marks a byte they reject.
 * The entries set to 1 are  ! # $ % & ' * + - . ^ _ ` | ~  plus the digits
 * 0-9 and the letters A-Z / a-z; every control byte, separator and byte
 * >= 0x7f is 0. Each line of the literal covers 32 consecutive byte values. */
static const char *token_char_map =
    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
    "\0\1\0\1\1\1\1\1\0\0\1\1\0\1\1\0\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0"
    "\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\1\1"
    "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\1\0\1\0"
    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
    "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";

/* Skip forward over bytes that fall outside every [lo, hi] pair in `ranges`.
 *
 * buf, buf_end  input window
 * ranges        consecutive (lo, hi) byte pairs, inclusive on both ends
 * ranges_size   number of bytes in `ranges` (at most 16 are used)
 * found         set to 1 when a byte inside one of the ranges was located,
 *               0 otherwise
 *
 * Returns the position of the matching byte when *found is 1. When *found is
 * 0 the returned pointer is where the caller must continue scanning byte by
 * byte: with CINATRA_SSE only whole 16-byte chunks are examined, and without
 * it the function does no scanning at all and returns `buf` unchanged. */
static const char *findchar_fast(const char *buf, const char *buf_end,
                                 const char *ranges, int ranges_size,
                                 int *found) {
  *found = 0;
#ifdef CINATRA_SSE
  if (likely(buf_end - buf >= 16)) {
    /* The range table is loaded as a full 16-byte vector, but callers pass
     * arrays as short as 5 bytes. Stage it through a zero-padded local so the
     * load never reads past the end of the caller's array. */
    char padded[16] = {0};
    const int used = ranges_size < 0 ? 0 : (ranges_size > 16 ? 16 : ranges_size);
    memcpy(padded, ranges, (size_t)used);
    const __m128i ranges16 = _mm_loadu_si128((const __m128i *)padded);

    size_t left = (buf_end - buf) & ~15;
    do {
      __m128i b16 = _mm_loadu_si128((const __m128i *)buf);
      int r = _mm_cmpestri(
          ranges16, used, b16, 16,
          _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS);
      if (unlikely(r != 16)) {
        /* r is the index of the first in-range byte within this chunk */
        buf += r;
        *found = 1;
        break;
      }
      buf += 16;
      left -= 16;
    } while (likely(left != 0));
  }
#else
  /* suppress unused parameter warning */
  (void)buf_end;
  (void)ranges;
  (void)ranges_size;
#endif
  return buf;
}

static const char *findchar_nonprintable_fast(const char *buf,
                                              const char *buf_end, int *found) {
#ifdef CINATRA_ARM_OPT
  *found = 0;

  const size_t block_size = sizeof(uint8x16_t) - 1;
  const char *const end =
      (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;

  for (; buf < end; buf += sizeof(uint8x16_t)) {
    uint8x16_t v = vld1q_u8((const uint8_t *)buf);

    v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')),
                 vceqq_u8(v, vmovq_n_u8('\177')));

    /* Pack the comparison result into 64 bits. */
    const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
    uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0);

    if (offset) {
      *found = 1;
      __asm__("rbit %x0, %x0" : "+r"(offset));
      static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
                    "Need the number of leading 0-bits in uint64_t.");
      /* offset uses 4 bits per byte of input. */
      buf += __builtin_clzll(offset) / 4;
      break;
    }
  }

  return buf;
#else
  static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";

  return findchar_fast(buf, buf_end, ranges2, 4, found);
#endif
}

/* Scan from `buf` to the end of the current line.
 *
 * On success *token / *token_len describe the bytes before the line
 * terminator (CRLF or a bare LF) and the returned pointer is the first byte
 * after the terminator. HT (0x09) and bytes >= 0x80 are allowed inside the
 * token; any other control byte or DEL that is not the line terminator makes
 * the function return NULL with *ret = -1. Running out of input returns NULL
 * with *ret = -2 (via CHECK_EOF).
 *
 * Three bulk-scan front ends are selected at compile time (SSE4.2 ranges,
 * NEON, or an 8x unrolled scalar loop); all of them fall through to the
 * byte-wise loop that handles the remaining tail. */
static const char *get_token_to_eol(const char *buf, const char *buf_end,
                                    const char **token, size_t *token_len,
                                    int *ret) {
  const char *token_start = buf;
#ifdef CINATRA_SSE
  static const char ranges1[] =
      "\0\010"
      /* allow HT */
      "\012\037"
      /* allow SP and up to but not including DEL */
      "\177\177"
      /* allow chars w. MSB set */
      ;
  int found;
  buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
  if (found)
    goto FOUND_CTL;
#elif defined(CINATRA_ARM_OPT)
  /* Process two 16-byte vectors per iteration; `end` is the limit up to which
   * a full 32-byte load stays inside the buffer. */
  const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
  const char *const end =
      (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;

  for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
    const uint8x16_t space = vmovq_n_u8('\040');
    const uint8x16_t threshold = vmovq_n_u8(0137u);
    const uint8x16_t v1 = vld1q_u8((const uint8_t *)buf);
    const uint8x16_t v2 = vld1q_u8((const uint8_t *)buf + sizeof(v1));
    uint8x16_t v3 = vsubq_u8(v1, space);
    uint8x16_t v4 = vsubq_u8(v2, space);

    /* Same test as IS_PRINTABLE_ASCII, inverted: flag bytes outside
     * 0x20..0x7e. */
    v3 = vcgeq_u8(v3, threshold);
    v4 = vcgeq_u8(v4, threshold);
    v3 = vorrq_u8(v3, v4);
    /* Pack the comparison result into half a vector, i.e. 64 bits. */
    v3 = vpmaxq_u8(v3, v3);

    if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
      /* At least one non-printable byte: redo the test precisely so that HT
       * and bytes >= 0x80 are not reported. */
      const uint8x16_t del = vmovq_n_u8('\177');
      /* This mask makes it possible to pack the comparison results into half a
       * vector, which has the same size as uint64_t. */
      const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
      const uint8x16_t tab = vmovq_n_u8('\011');

      v3 = vcltq_u8(v1, space);
      v4 = vcltq_u8(v2, space);
      v3 = vbicq_u8(v3, vceqq_u8(v1, tab));
      v4 = vbicq_u8(v4, vceqq_u8(v2, tab));
      v3 = vorrq_u8(v3, vceqq_u8(v1, del));
      v4 = vorrq_u8(v4, vceqq_u8(v2, del));
      /* After masking, four consecutive bytes in the results do not have the
       * same bits set. */
      v3 = vandq_u8(v3, mask);
      v4 = vandq_u8(v4, mask);
      /* Pack the comparison results into 128, and then 64 bits. */
      v3 = vpaddq_u8(v3, v4);
      v3 = vpaddq_u8(v3, v3);

      uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);

      if (offset) {
        __asm__("rbit %x0, %x0" : "+r"(offset));
        static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
                      "Need the number of leading 0-bits in uint64_t.");
        /* offset uses 2 bits per byte of input. */
        buf += __builtin_clzll(offset) / 2;
        goto FOUND_CTL;
      }
    }
  }
#else
  /* find non-printable char within the next 8 bytes, this is the hottest code;
   * manually inlined */
  while (likely(buf_end - buf >= 8)) {
#define DOIT()                               \
  do {                                       \
    if (unlikely(!IS_PRINTABLE_ASCII(*buf))) \
      goto NonPrintable;                     \
    ++buf;                                   \
  } while (0)
    DOIT();
    DOIT();
    DOIT();
    DOIT();
    DOIT();
    DOIT();
    DOIT();
    DOIT();
#undef DOIT
    continue;
  NonPrintable:
    /* Only a control byte other than HT, or DEL, ends the token; HT and bytes
     * >= 0x80 are skipped and the scan continues. */
    if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) ||
        unlikely(*buf == '\177')) {
      goto FOUND_CTL;
    }
    ++buf;
  }
#endif
  /* Byte-wise scan of whatever the bulk scan above left over. */
  for (;; ++buf) {
    CHECK_EOF();
    if (unlikely(!IS_PRINTABLE_ASCII(*buf))) {
      if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) ||
          unlikely(*buf == '\177')) {
        goto FOUND_CTL;
      }
    }
  }
FOUND_CTL:
  /* `buf` is on the control byte that stopped the scan. */
  if (likely(*buf == '\015')) {
    ++buf;
    EXPECT_CHAR('\012');
    /* buf is now past CRLF, hence the -2 */
    *token_len = buf - 2 - token_start;
  }
  else if (*buf == '\012') {
    *token_len = buf - token_start;
    ++buf;
  }
  else {
    *ret = -1;
    return NULL;
  }
  *token = token_start;

  return buf;
}

/* Check whether [buf, buf_end) contains the blank line that ends a header
 * section, i.e. two consecutive line terminators (each CRLF or a bare LF).
 *
 * last_len  number of bytes already examined by an earlier call; the scan
 *           restarts 3 bytes before that point so a terminator split across
 *           two reads is still seen.
 *
 * Returns the position just past the second terminator. Returns NULL with
 * *ret = -2 when the input ends first, and NULL with *ret = -1 when a CR is
 * not followed by LF. */
static const char *is_complete(const char *buf, const char *buf_end,
                               size_t last_len, int *ret) {
  int eol_run = 0; /* consecutive line terminators seen so far */

  if (last_len >= 3) {
    buf += last_len - 3;
  }

  for (;;) {
    CHECK_EOF();
    const char ch = *buf++;
    if (ch == '\015') {
      EXPECT_CHAR('\012');
      ++eol_run;
    }
    else if (ch == '\012') {
      ++eol_run;
    }
    else {
      eol_run = 0;
    }
    if (eol_run == 2) {
      return buf;
    }
  }
}

/* PARSE_INT(valp_, mul_): read one decimal digit at `buf`, store
 * digit * mul_ through valp_ and advance `buf`. A non-digit makes the
 * enclosing function return NULL with *ret = -1. No bounds check is done, so
 * the caller must already know that *buf is readable. The expansion ends in
 * its own ';'. */
#define PARSE_INT(valp_, mul_)    \
  if (*buf < '0' || '9' < *buf) { \
    buf++;                        \
    *ret = -1;                    \
    return NULL;                  \
  }                               \
  *(valp_) = (mul_) * (*buf++ - '0');

/* PARSE_INT_3(valp_): read exactly three decimal digits into *valp_
 * (hundreds, tens, units) using PARSE_INT; the same error behaviour and the
 * same missing bounds check apply. `valp_` is expanded unparenthesised, so
 * pass a plain pointer name. */
#define PARSE_INT_3(valp_) \
  do {                     \
    int res_ = 0;          \
    PARSE_INT(&res_, 100)  \
    *valp_ = res_;         \
    PARSE_INT(&res_, 10)   \
    *valp_ += res_;        \
    PARSE_INT(&res_, 1)    \
    *valp_ += res_;        \
  } while (0)

/* Parse the literal "HTTP/1." followed by a single minor-version digit.
 *
 * At least 9 readable bytes are required (the 8 bytes consumed here plus one
 * more), otherwise NULL is returned with *ret = -2. Any mismatch returns NULL
 * with *ret = -1. On success *minor_version holds the digit's value and the
 * returned pointer, which is always inside [buf, buf_end), is the byte after
 * the digit. */
static const char *parse_http_version(const char *buf, const char *buf_end,
                                      int *minor_version, int *ret) {
  static const char prefix[] = "HTTP/1.";

  if (buf_end - buf < 9) {
    *ret = -2;
    return NULL;
  }
  /* The length check above covers every byte read below. */
  for (size_t i = 0; i < sizeof(prefix) - 1; ++i) {
    EXPECT_CHAR_NO_CHECK(prefix[i]);
  }
  PARSE_INT(minor_version, 1);
  return buf;
}

#ifdef CINATRA_AVX2
/* Count trailing zero bits of `in` by issuing the x86 `tzcnt` instruction
 * through GNU-style inline assembly. The callers in this file treat a result
 * of 64 as "no bit set".
 * NOTE(review): the input is `unsigned long long` while the output register
 * variable is `unsigned long`; this presumes both are 64 bits wide -- confirm
 * for every target this is built on. */
static unsigned long TZCNT(unsigned long long in) {
  unsigned long res;
  asm("tzcnt %1, %0\n\t" : "=r"(res) : "r"(in));
  return res;
}
/* Classify 32 input bytes.
 *
 * Bit i of *range0 is set when byte i is 0x00..0x1f or ':' (0x3a).
 * Bit i of *range1 is set when byte i is 0x00..0x1f except HT (0x09), or is
 * DEL (0x7f).
 * Only the low 32 bits of each output word can be set. */
static void find_ranges32(__m256i b0, unsigned long *range0,
                          unsigned long *range1) {
  const __m256i minus_one = _mm256_set1_epi8(0x00 - 1);
  const __m256i first_printable = _mm256_set1_epi8(0x1f + 1);
  const __m256i colon = _mm256_set1_epi8(0x3a);
  const __m256i del = _mm256_set1_epi8(0x7f);
  const __m256i tab = _mm256_set1_epi8(0x09);

  /* The byte compares are signed, so "x < 0x20" alone would also accept
   * 0x80..0xff; requiring "x > -1" as well leaves exactly 0x00..0x1f. */
  const __m256i non_negative = _mm256_cmpgt_epi8(b0, minus_one);
  const __m256i below_space = _mm256_cmpgt_epi8(first_printable, b0);
  const __m256i is_ctl = _mm256_and_si256(below_space, non_negative);

  const __m256i is_colon = _mm256_cmpeq_epi8(colon, b0);
  const __m256i is_del = _mm256_cmpeq_epi8(del, b0);
  const __m256i is_tab = _mm256_cmpeq_epi8(b0, tab);

  const __m256i name_stop = _mm256_or_si256(is_colon, is_ctl);
  const __m256i value_stop =
      _mm256_or_si256(is_del, _mm256_andnot_si256(is_tab, is_ctl));

  /* One bit per input byte. */
  const unsigned int name_bits = _mm256_movemask_epi8(name_stop);
  const unsigned int value_bits = _mm256_movemask_epi8(value_stop);

  *range0 = name_bits;
  *range1 = value_bits;
}

/* Classify 64 input bytes, passed as two 32-byte vectors (b0 first).
 *
 * Bit i of *range0 is set when byte i is 0x00..0x1f or ':' (0x3a).
 * Bit i of *range1 is set when byte i is 0x00..0x1f except HT (0x09), or is
 * DEL (0x7f).
 * b0 supplies bits 0..31 and b1 bits 32..63 of each output word. */
static void find_ranges64(__m256i b0, __m256i b1, unsigned long *range0,
                          unsigned long *range1) {
  const __m256i minus_one = _mm256_set1_epi8(0x00 - 1);
  const __m256i first_printable = _mm256_set1_epi8(0x1f + 1);
  const __m256i colon = _mm256_set1_epi8(0x3a);
  const __m256i del = _mm256_set1_epi8(0x7f);
  const __m256i tab = _mm256_set1_epi8(0x09);

  /* Control bytes 0x00..0x1f. The compares are signed, hence the extra
   * "x > -1" term that excludes 0x80..0xff. */
  const __m256i ctl_lo =
      _mm256_and_si256(_mm256_cmpgt_epi8(first_printable, b0),
                       _mm256_cmpgt_epi8(b0, minus_one));
  const __m256i ctl_hi =
      _mm256_and_si256(_mm256_cmpgt_epi8(first_printable, b1),
                       _mm256_cmpgt_epi8(b1, minus_one));

  /* Header-name terminators: control bytes or ':'. */
  const __m256i name_lo = _mm256_or_si256(_mm256_cmpeq_epi8(colon, b0), ctl_lo);
  const __m256i name_hi = _mm256_or_si256(_mm256_cmpeq_epi8(colon, b1), ctl_hi);

  /* Header-value terminators: control bytes other than HT, or DEL. */
  const __m256i value_lo =
      _mm256_or_si256(_mm256_cmpeq_epi8(del, b0),
                      _mm256_andnot_si256(_mm256_cmpeq_epi8(b0, tab), ctl_lo));
  const __m256i value_hi =
      _mm256_or_si256(_mm256_cmpeq_epi8(del, b1),
                      _mm256_andnot_si256(_mm256_cmpeq_epi8(b1, tab), ctl_hi));

  /* Reduce each vector to a 32-bit mask and glue the halves together. */
  unsigned int lo_bits = _mm256_movemask_epi8(name_lo);
  unsigned int hi_bits = _mm256_movemask_epi8(name_hi);
  *range0 = lo_bits ^ ((unsigned long)hi_bits << 32);

  lo_bits = _mm256_movemask_epi8(value_lo);
  hi_bits = _mm256_movemask_epi8(value_hi);
  *range1 = lo_bits ^ ((unsigned long)hi_bits << 32);
}

/* This function parses 128 bytes at a time, creating bitmap of all interesting
 * tokens */
static void find_ranges(const char *buf, const char *buf_end,
                        unsigned long *range0, unsigned long *range1) {
  const __m256i rr0 = _mm256_set1_epi8(0x00 - 1);
  const __m256i rr1 = _mm256_set1_epi8(0x1f + 1);
  const __m256i rr2 = _mm256_set1_epi8(0x3a);
  const __m256i rr4 = _mm256_set1_epi8(0x7f);
  const __m256i rr7 = _mm256_set1_epi8(0x09);

  __m256i b0, b1, b2, b3;
  unsigned char tmpbuf[32];
  int i;
  int dist;

  if ((dist = buf_end - buf) < 128) {
    // memcpy(tmpbuf, buf + (dist & (-32)), dist & 31);
    for (i = 0; i < (dist & 31); i++) tmpbuf[i] = buf[(dist & (-32)) + i];
    if (dist >= 96) {
      b0 = _mm256_loadu_si256((void *)buf + 32 * 0);
      b1 = _mm256_loadu_si256((void *)buf + 32 * 1);
      b2 = _mm256_loadu_si256((void *)buf + 32 * 2);
      b3 = _mm256_loadu_si256((void *)tmpbuf);
    }
    else if (dist >= 64) {
      b0 = _mm256_loadu_si256((void *)buf + 32 * 0);
      b1 = _mm256_loadu_si256((void *)buf + 32 * 1);
      b2 = _mm256_loadu_si256((void *)tmpbuf);
      b3 = _mm256_setzero_si256();
    }
    else {
      if (dist < 32) {
        b0 = _mm256_loadu_si256((void *)tmpbuf);
        return find_ranges32(b0, range0, range1);
      }
      else {
        b0 = _mm256_loadu_si256((void *)buf + 32 * 0);
        b1 = _mm256_loadu_si256((void *)tmpbuf);
        return find_ranges64(b0, b1, range0, range1);
      }
    }
  }
  else {
    /* Load 128 bytes */
    b0 = _mm256_loadu_si256((void *)buf + 32 * 0);
    b1 = _mm256_loadu_si256((void *)buf + 32 * 1);
    b2 = _mm256_loadu_si256((void *)buf + 32 * 2);
    b3 = _mm256_loadu_si256((void *)buf + 32 * 3);
  }

  /* 0<=x */
  __m256i gz0 = _mm256_cmpgt_epi8(b0, rr0);
  __m256i gz1 = _mm256_cmpgt_epi8(b1, rr0);
  __m256i gz2 = _mm256_cmpgt_epi8(b2, rr0);
  __m256i gz3 = _mm256_cmpgt_epi8(b3, rr0);
  /* 0=<x<=1f */
  __m256i z_1f_0 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b0), gz0);
  __m256i z_1f_1 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b1), gz1);
  __m256i z_1f_2 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b2), gz2);
  __m256i z_1f_3 = _mm256_and_si256(_mm256_cmpgt_epi8(rr1, b3), gz3);
  /* 0<=x<=1f || x==3a */
  __m256i range0_0 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b0), z_1f_0);
  __m256i range0_1 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b1), z_1f_1);
  __m256i range0_2 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b2), z_1f_2);
  __m256i range0_3 = _mm256_or_si256(_mm256_cmpeq_epi8(rr2, b3), z_1f_3);
  /* 0<=x<9 || 9<x<=1f || x==7f */
  __m256i range1_0 =
      _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b0),
                      _mm256_andnot_si256(_mm256_cmpeq_epi8(b0, rr7), z_1f_0));
  __m256i range1_1 =
      _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b1),
                      _mm256_andnot_si256(_mm256_cmpeq_epi8(b1, rr7), z_1f_1));
  __m256i range1_2 =
      _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b2),
                      _mm256_andnot_si256(_mm256_cmpeq_epi8(b2, rr7), z_1f_2));
  __m256i range1_3 =
      _mm256_or_si256(_mm256_cmpeq_epi8(rr4, b3),
                      _mm256_andnot_si256(_mm256_cmpeq_epi8(b3, rr7), z_1f_3));
  /* Generate bit masks */
  unsigned int r0 = _mm256_movemask_epi8(range0_0);
  unsigned int r1 = _mm256_movemask_epi8(range0_1);
  /* Combine 32bit masks into a single 64bit mask */
  *range0 = r0 ^ ((unsigned long)r1 << 32);

  r0 = _mm256_movemask_epi8(range0_2);
  r1 = _mm256_movemask_epi8(range0_3);
  range0[1] = r0 ^ ((unsigned long)r1 << 32);

  r0 = _mm256_movemask_epi8(range1_0);
  r1 = _mm256_movemask_epi8(range1_1);

  *range1 = r0 ^ ((unsigned long)r1 << 32);
  r0 = _mm256_movemask_epi8(range1_2);
  r1 = _mm256_movemask_epi8(range1_3);

  range1[1] = r0 ^ ((unsigned long)r1 << 32);
}

/* AVX2 header-block parser.
 *
 * Parses header lines from `buf` until the empty line that ends the header
 * section and stores them in `headers`.
 *
 * headers      output array with room for `max_headers` entries
 * num_headers  in: index of the first free slot; out: number of entries that
 *              have been stored completely
 * ret          receives -1 (malformed input or too many headers) or -2 (more
 *              input needed) when NULL is returned
 *
 * Returns the position just past the terminating empty line, or NULL.
 *
 * The scan works on 128-byte windows: find_ranges() produces one bitmap of
 * header-name terminators (rr0) and one of header-value terminators (rr1),
 * and the loops below walk those bitmaps with TZCNT instead of inspecting
 * every byte. Only control bytes and ':' end a name, so other non-token bytes
 * inside a name are not rejected here.
 *
 * NOTE(review): parse_request() in this file calls parse_headers() with three
 * additional bool& arguments (has_connection, has_close, has_upgrade) that
 * this variant does not take -- confirm how CINATRA_AVX2 builds are meant to
 * call it. */
static const char *parse_headers(const char *buf, const char *buf_end,
                                 http_header *headers, size_t *num_headers,
                                 size_t max_headers, int *ret) {
  /* Bitmap for the first type of tokens */
  unsigned long rr0[2] = {0};
  /* Bitmap for the second type of tokens */
  unsigned long rr1[2] = {0};
  /* Pointer to the start of the currently parsed block of 128 bytes; NULL
   * until the first bitmap has been computed */
  const char *prep_start = NULL;
  int found;
  size_t n_headers = *num_headers;

  for (;; ++n_headers) {
    const char *name;
    size_t name_len;
    const char *value;
    size_t value_len;
    /* Publish the count first so that every exit below, including the ones
     * hidden inside CHECK_EOF / EXPECT_CHAR, reports it consistently. */
    *num_headers = n_headers;
    CHECK_EOF();
    if (*buf == '\015') {
      ++buf;
      EXPECT_CHAR('\012');
      break;
    }
    else if (*buf == '\012') {
      ++buf;
      break;
    }
    if (n_headers == max_headers) {
      *ret = -1;
      return NULL;
    }

    /* NOTE(review): besides continuation lines (leading SP / HT), a line
     * whose first byte is 'A'..'Z' (65..90) also skips name parsing and is
     * stored with an empty name. That looks unintended but is kept as found
     * -- confirm. */
    if (!(n_headers != 0 && (*buf == ' ' || *buf == '\t')) &&
        !(*buf >= 65 && *buf <= 90)) {
      if (!token_char_map[(unsigned char)*buf]) {
        *ret = -1;
        return NULL;
      }
      name = buf;

      /* Attempt to find a match in the index */
      found = 0;
      do {
        /* No bitmap yet counts as "too old"; this avoids subtracting from a
         * null pointer. */
        unsigned long distance =
            prep_start != NULL ? (unsigned long)(buf - prep_start) : 128UL;
        if (unlikely(distance >=
                     128)) { /* Bitmaps are too old, make new ones */
          prep_start = buf;
          distance = 0;
          find_ranges(buf, buf_end, rr0, rr1);
        }
        else if (distance >= 64) { /* In the second half of the bitmap */
          unsigned long index =
              rr0[1] >> (distance - 64);     /* Correct offset of the bitmap */
          unsigned long find = TZCNT(index); /* Find next set bit */
          if ((find < 64)) {                 /* Token found */
            buf += find;
            found = 1;
            break;
          }
          buf = prep_start + 128; /* No token was found in the current bitmap */
          continue;
        }
        unsigned long index =
            rr0[0] >> (distance);          /* In the first half of the bitmap */
        unsigned long find = TZCNT(index); /* Find next set bit */
        if ((find < 64)) {                 /* Token found */
          buf += find;
          found = 1;
          break;
        } /* Token not found, look at second half of bitmap */
        index = rr0[1];
        find = TZCNT(index);
        if ((find < 64)) {
          buf += 64 + find - distance;
          found = 1;
          break;
        }

        buf = prep_start + 128;
      } while (buf < buf_end);

      /* A hit at or beyond buf_end comes from bitmap positions that do not
       * correspond to real input, so it means "need more data" too. */
      if (!found || buf >= buf_end) {
        *ret = -2;
        return NULL;
      }
      /* The name bitmap also flags control bytes; only ':' may end a name. */
      if (*buf != ':') {
        *ret = -1;
        return NULL;
      }
      name_len = buf - name;
      ++buf;
      CHECK_EOF();
      while ((*buf == ' ' || *buf == '\t')) {
        ++buf;
        CHECK_EOF();
      }
    }
    else {
      name = NULL;
      name_len = 0;
    }
    const char *token_start = buf;

    found = 0;

    do {
      /* Same algorithm as above, using the value-terminator bitmap */
      unsigned long distance =
          prep_start != NULL ? (unsigned long)(buf - prep_start) : 128UL;
      if (unlikely(distance >= 128)) {
        prep_start = buf;
        distance = 0;
        find_ranges(buf, buf_end, rr0, rr1);
      }
      else if (distance >= 64) {
        unsigned long index = rr1[1] >> (distance - 64);
        unsigned long find = TZCNT(index);
        if ((find < 64)) {
          buf += find;
          found = 1;
          break;
        }
        buf = prep_start + 128;
        continue;
      }
      unsigned long index = rr1[0] >> (distance);
      unsigned long find = TZCNT(index);
      if ((find < 64)) {
        buf += find;
        found = 1;
        break;
      }
      index = rr1[1];
      find = TZCNT(index);
      if ((find < 64)) {
        buf += 64 + find - distance;
        found = 1;
        break;
      }

      buf = prep_start + 128;
    } while (buf < buf_end);

    if (!found || buf >= buf_end) {
      *ret = -2;
      return NULL;
    }

    /* `buf` is on the byte that ended the value; it must start CRLF or be a
     * bare LF. The bytes are inspected one at a time so that nothing past
     * buf_end is read and no unaligned 16-bit load is needed. */
    value_len = buf - token_start;
    if (likely(*buf == '\015')) {
      ++buf;
      EXPECT_CHAR('\012');
    }
    else if (*buf == '\012') {
      ++buf;
    }
    else {
      *ret = -1;
      return NULL;
    }
    value = token_start;
    headers[n_headers] = {std::string_view{name, name_len},
                          std::string_view{value, value_len}};
  }
  *num_headers = n_headers;
  return buf;
}

#else

/* Scalar / SSE header-block parser.
 *
 * Parses header lines from `buf` until the empty line that ends the header
 * section and stores them in `headers`.
 *
 * headers         output array with room for `max_headers` entries
 * num_headers     in: index of the first free slot; incremented once per
 *                 stored header
 * ret             receives -1 (malformed input or too many headers) or -2
 *                 (more input needed) when NULL is returned
 * has_connection  set to true when a 10-byte header name ending in
 *                 "onnection" is seen
 * has_close       set to true when that header's value starts with 'c' / 'C'
 * has_upgrade     set to true when that header's value starts with 'u' / 'U'
 *
 * The three flags are only ever set, never cleared; callers initialise them.
 * Only the first byte of the Connection value is inspected, so this is a
 * quick hint rather than a full parse of the option list.
 *
 * Returns the position just past the terminating empty line, or NULL. */
static const char *parse_headers(const char *buf, const char *buf_end,
                                 http_header *headers, size_t *num_headers,
                                 size_t max_headers, int *ret,
                                 bool &has_connection, bool &has_close,
                                 bool &has_upgrade) {
  for (;; ++*num_headers) {
    const char *name;
    size_t name_len;
    const char *value;
    size_t value_len;
    CHECK_EOF();
    if (*buf == '\015') {
      ++buf;
      EXPECT_CHAR('\012');
      break;
    }
    else if (*buf == '\012') {
      ++buf;
      break;
    }
    if (*num_headers == max_headers) {
      *ret = -1;
      return NULL;
    }
    if (!(*num_headers != 0 && (*buf == ' ' || *buf == '\t'))) {
      /* parsing name, but do not discard SP before colon, see
       * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */
      name = buf;
      static const char ALIGNED(16) ranges1[] =
          "\x00 "  /* control chars and up to SP */
          "\"\""   /* 0x22 */
          "()"     /* 0x28,0x29 */
          ",,"     /* 0x2c */
          "//"     /* 0x2f */
          ":@"     /* 0x3a-0x40 */
          "[]"     /* 0x5b-0x5d */
          "{\377"; /* 0x7b-0xff */
      int found;
      buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
      if (!found) {
        CHECK_EOF();
      }
      /* Byte-wise scan up to ':'; anything that is not a token character
       * before it is an error. */
      while (1) {
        if (*buf == ':') {
          break;
        }
        else if (!token_char_map[(unsigned char)*buf]) {
          *ret = -1;
          return NULL;
        }
        ++buf;
        CHECK_EOF();
      }
      if ((name_len = buf - name) == 0) {
        *ret = -1;
        return NULL;
      }
      ++buf;
      /* skip optional whitespace between ':' and the value */
      for (;; ++buf) {
        CHECK_EOF();
        if (!(*buf == ' ' || *buf == '\t')) {
          break;
        }
      }
    }
    else {
      /* continuation line of the previous header */
      name = NULL;
      name_len = 0;
    }
    if ((buf = get_token_to_eol(buf, buf_end, &value, &value_len, ret)) ==
        NULL) {
      return NULL;
    }
    if (name_len == 10) {
      /* The first byte is skipped so that both "Connection" and "connection"
       * match with a single memcmp. */
      if (memcmp(name + 1, "onnection", name_len - 1) == 0) {
        has_connection = true;
        if (value_len != 0) {
          /* Connection options are case-insensitive, so accept either case
           * for "upgrade" exactly as is done for "close". */
          const char ch = *value;
          if (ch == 'U' || ch == 'u') {
            has_upgrade = true;
          }
          else if (ch == 'c' || ch == 'C') {
            has_close = true;
          }
        }
      }
    }
    headers[*num_headers] = {std::string_view{name, name_len},
                             std::string_view{value, value_len}};
  }
  return buf;
}

#endif

/* ADVANCE_PATH(tok, toklen, has_query): scan the space-terminated request
 * target starting at `buf`.
 *
 * Expects `buf`, `buf_end` and `ret` in the enclosing scope. On success `buf`
 * is left on the terminating ' ' and (tok, toklen) describe the bytes before
 * it. `has_query` is set to true when a '?' is seen; it is never cleared. A
 * control byte or DEL makes the enclosing function return NULL with
 * *ret = -1; running out of input returns NULL with *ret = -2.
 *
 * findchar_fast() may jump over whole 16-byte chunks, so the prefix it
 * skipped is checked for '?' separately; the byte-wise loop only sees the
 * remainder. The range table is a full 16 bytes because findchar_fast() loads
 * it as one vector. */
#define ADVANCE_PATH(tok, toklen, has_query)                                  \
  do {                                                                        \
    const char *tok_start = buf;                                              \
    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";           \
    int found2;                                                               \
    buf = findchar_fast(buf, buf_end, ranges2, 4, &found2);                   \
    if (buf != tok_start &&                                                   \
        memchr(tok_start, '?', (size_t)(buf - tok_start)) != NULL) {          \
      has_query = true;                                                       \
    }                                                                         \
    if (!found2) {                                                            \
      CHECK_EOF();                                                            \
    }                                                                         \
    while (1) {                                                               \
      if (*buf == ' ') {                                                      \
        break;                                                                \
      }                                                                       \
      else if (unlikely(!IS_PRINTABLE_ASCII(*buf))) {                         \
        if ((unsigned char)*buf < '\040' || *buf == '\177') {                 \
          *ret = -1;                                                          \
          return NULL;                                                        \
        }                                                                     \
      }                                                                       \
      else if (unlikely(*buf == '?')) {                                       \
        has_query = true;                                                     \
      }                                                                       \
      ++buf;                                                                  \
      CHECK_EOF();                                                            \
    }                                                                         \
    tok = tok_start;                                                          \
    toklen = buf - tok_start;                                                 \
  } while (0)

/* Parses an HTTP request line ("<method> <path> HTTP/1.x" followed by CRLF or
 * a bare LF) and then delegates the remaining bytes to parse_headers().
 *
 * buf / buf_end       : input range [buf, buf_end)
 * method / method_len : receive the method token (points into the input)
 * path / path_len     : receive the request target (points into the input)
 * minor_version       : filled in by parse_http_version()
 * headers, num_headers, max_headers, has_connection, has_close, has_upgrade
 *                     : forwarded unchanged to parse_headers()
 * has_query           : set to true by ADVANCE_PATH when the path contains '?'
 *                       (it is never reset to false here)
 * ret                 : receives the error code when NULL is returned; -1 is
 *                       used for malformed input
 *
 * Returns whatever parse_headers() returns on success, or NULL on failure.
 * NOTE: CHECK_EOF, EXPECT_CHAR, ADVANCE_TOKEN and ADVANCE_PATH are macros that
 * operate on the local names `buf`, `buf_end` and `ret` and may return from
 * this function directly. */
static const char *parse_request(
    const char *buf, const char *buf_end, const char **method,
    size_t *method_len, const char **path, size_t *path_len, int *minor_version,
    http_header *headers, size_t *num_headers, size_t max_headers, int *ret,
    bool &has_connection, bool &has_close, bool &has_upgrade, bool &has_query) {
  /* skip first empty line (some clients add CRLF after POST content) */
  CHECK_EOF();
  if (*buf == '\015') {
    ++buf;
    EXPECT_CHAR('\012');
  }
  else if (*buf == '\012') {
    ++buf;
  }

  /* parse request line */
  ADVANCE_TOKEN(*method, *method_len);
  ++buf; /* step over the delimiter that ended the method token */
  ADVANCE_PATH(*path, *path_len, has_query);
  /* a request line with an empty method or an empty request target is never
   * valid; reject it here instead of handing zero-length views to callers */
  if (*method_len == 0 || *path_len == 0) {
    *ret = -1;
    return NULL;
  }
  ++buf; /* step over the space that ended the path */
  if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
    return NULL;
  }
  /* the request line must be terminated by CRLF or a bare LF */
  if (*buf == '\015') {
    ++buf;
    EXPECT_CHAR('\012');
  }
  else if (*buf == '\012') {
    ++buf;
  }
  else {
    *ret = -1;
    return NULL;
  }

  return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret,
                       has_connection, has_close, has_upgrade);
}

/* Public entry point for parsing an HTTP request.
 *
 * buf_start / len : buffer holding the request
 * num_headers     : in  - capacity of `headers`
 *                   out - reset to 0 here, then updated by the parser
 * last_len        : when non-zero, is_complete() is consulted first and
 *                   parsing then starts at buf_start + last_len
 * All other output parameters are reset before parsing starts.
 *
 * Returns the error code produced by is_complete() / parse_request() when
 * either fails; otherwise the distance from buf_start + last_len to the
 * position returned by parse_request(). */
inline int phr_parse_request(const char *buf_start, size_t len,
                             const char **method, size_t *method_len,
                             const char **path, size_t *path_len,
                             int *minor_version, http_header *headers,
                             size_t *num_headers, size_t last_len,
                             bool &has_connection, bool &has_close,
                             bool &has_upgrade, bool &has_query) {
  const char *const input_end = buf_start + len;
  /* capture the capacity before the counter is cleared below */
  const size_t header_capacity = *num_headers;
  int rc;

  *method = NULL;
  *method_len = 0;
  *path = NULL;
  *path_len = 0;
  *minor_version = -1;
  *num_headers = 0;

  /* if last_len != 0, check if the request is complete (a fast countermeasure
   * against slowloris) */
  if (last_len != 0) {
    if (is_complete(buf_start, input_end, last_len, &rc) == NULL) {
      return rc;
    }
  }

  const char *const parsed_end = parse_request(
      buf_start + last_len, input_end, method, method_len, path, path_len,
      minor_version, headers, num_headers, header_capacity, &rc,
      has_connection, has_close, has_upgrade, has_query);
  if (parsed_end == NULL) {
    return rc;
  }

  return (int)(parsed_end - buf_start - last_len);
}

/* Parses an HTTP status line ("HTTP/1.x <3-digit status> <message>") and then
 * delegates the remaining bytes to parse_headers().
 *
 * buf / buf_end  : input range [buf, buf_end)
 * minor_version  : filled in by parse_http_version()
 * status         : receives the three-digit status code
 * msg / msg_len  : receive the reason phrase via get_token_to_eol()
 * headers, num_headers, max_headers : forwarded to parse_headers()
 * ret            : receives the error code when NULL is returned: -1 for a
 *                  malformed status line, -2 when fewer than four bytes are
 *                  available for the status code
 *
 * Returns whatever parse_headers() returns on success, or NULL on failure.
 * NOTE: PARSE_INT_3 is a macro; the local names `buf` and `ret` are kept
 * as-is because it may rely on them. */
inline const char *parse_response(const char *buf, const char *buf_end,
                                  int *minor_version, int *status,
                                  const char **msg, size_t *msg_len,
                                  http_header *headers, size_t *num_headers,
                                  size_t max_headers, int *ret) {
  /* parse "HTTP/1.x" */
  if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
    return NULL;
  }
  /* skip space */
  if (*buf++ != ' ') {
    *ret = -1;
    return NULL;
  }
  /* parse status code, we want at least [:digit:][:digit:][:digit:]<other char>
   * to try to parse */
  if (buf_end - buf < 4) {
    *ret = -2;
    return NULL;
  }
  PARSE_INT_3(status);

  /* skip space */
  if (*buf++ != ' ') {
    *ret = -1;
    return NULL;
  }
  /* get message */
  if ((buf = get_token_to_eol(buf, buf_end, msg, msg_len, ret)) == NULL) {
    return NULL;
  }

  /* The connection-related flags are not reported to the caller of this
   * function. They are still given a defined value so that parse_headers()
   * never observes an indeterminate bool. */
  bool has_connection = false, has_close = false, has_upgrade = false;

  return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret,
                       has_connection, has_close, has_upgrade);
}

/* Public entry point for parsing an HTTP response.
 *
 * buf_start / len : buffer holding the response
 * num_headers     : in  - capacity of `headers`
 *                   out - reset to 0 here, then updated by the parser
 * last_len        : when non-zero, is_complete() is consulted before parsing
 * All other output parameters are reset before parsing starts.
 *
 * Returns the error code produced by is_complete() / parse_response() when
 * either fails; otherwise the distance from buf_start to the position
 * returned by parse_response(). */
inline int phr_parse_response(const char *buf_start, size_t len,
                              int *minor_version, int *status, const char **msg,
                              size_t *msg_len, http_header *headers,
                              size_t *num_headers, size_t last_len) {
  const char *const input_end = buf_start + len;
  /* capture the capacity before the counter is cleared below */
  const size_t header_capacity = *num_headers;
  int rc;

  *minor_version = -1;
  *status = 0;
  *msg = NULL;
  *msg_len = 0;
  *num_headers = 0;

  /* if last_len != 0, check if the response is complete (a fast countermeasure
   * against slowloris) */
  if (last_len != 0) {
    if (is_complete(buf_start, input_end, last_len, &rc) == NULL) {
      return rc;
    }
  }

  const char *const parsed_end =
      parse_response(buf_start, input_end, minor_version, status, msg, msg_len,
                     headers, num_headers, header_capacity, &rc);
  if (parsed_end == NULL) {
    return rc;
  }

  return (int)(parsed_end - buf_start);
}

/* Public entry point for parsing a block of HTTP headers on its own.
 *
 * buf_start / len : buffer holding the header lines
 * headers         : array that receives the parsed headers
 * num_headers     : in  - capacity of `headers`
 *                   out - reset to 0 here, then updated by parse_headers()
 * last_len        : when non-zero, is_complete() is consulted before parsing
 *
 * Returns the error code produced by is_complete() / parse_headers() when
 * either fails; otherwise the distance from buf_start to the position
 * returned by parse_headers(). */
inline int phr_parse_headers(const char *buf_start, size_t len,
                             http_header *headers, size_t *num_headers,
                             size_t last_len) {
  const char *buf = buf_start, *buf_end = buf + len;
  /* capture the capacity before the counter is cleared below */
  size_t max_headers = *num_headers;
  int r;

  *num_headers = 0;

  /* if last_len != 0, check if the headers are complete (a fast countermeasure
   * against slowloris) */
  if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) {
    return r;
  }

  /* The connection-related flags are not part of this function's interface.
   * They are still given a defined value so that parse_headers() never
   * observes an indeterminate bool. */
  bool has_connection = false, has_close = false, has_upgrade = false;
  if ((buf = parse_headers(buf, buf_end, headers, num_headers, max_headers, &r,
                           has_connection, has_close, has_upgrade)) == NULL) {
    return r;
  }

  return (int)(buf - buf_start);
}

/* States of the chunked transfer-encoding decoder, stored in
 * phr_chunked_decoder::_state and driven by phr_decode_chunked(). */
enum {
  /* reading the hexadecimal digits of the chunk size */
  CHUNKED_IN_CHUNK_SIZE,
  /* skipping the remainder of the chunk-size line, up to and including LF */
  CHUNKED_IN_CHUNK_EXT,
  /* copying chunk payload bytes */
  CHUNKED_IN_CHUNK_DATA,
  /* expecting the line terminator that follows the chunk payload */
  CHUNKED_IN_CHUNK_CRLF,
  /* at the beginning of a trailer line; an empty line ends the body */
  CHUNKED_IN_TRAILERS_LINE_HEAD,
  /* inside a trailer line, skipping up to and including LF */
  CHUNKED_IN_TRAILERS_LINE_MIDDLE
};

/* Converts one hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F') to its
 * numeric value in the range 0..15. Any other input yields -1. */
static int decode_hex(int ch) {
  int digit = -1;

  if (ch >= 'a' && ch <= 'f') {
    digit = 0xa + (ch - 'a');
  }
  else if (ch >= 'A' && ch <= 'F') {
    digit = 0xa + (ch - 'A');
  }
  else if (ch >= '0' && ch <= '9') {
    digit = ch - '0';
  }

  return digit;
}

inline ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder,
                                  char *buf, size_t *_bufsz) {
  size_t dst = 0, src = 0, bufsz = *_bufsz;
  ssize_t ret = -2; /* incomplete */

  while (1) {
    switch (decoder->_state) {
      case CHUNKED_IN_CHUNK_SIZE:
        for (;; ++src) {
          int v;
          if (src == bufsz)
            goto Exit;
          if ((v = decode_hex(buf[src])) == -1) {
            if (decoder->_hex_count == 0) {
              ret = -1;
              goto Exit;
            }
            break;
          }
          if (decoder->_hex_count == sizeof(size_t) * 2) {
            ret = -1;
            goto Exit;
          }
          decoder->bytes_left_in_chunk = decoder->bytes_left_in_chunk * 16 + v;
          ++decoder->_hex_count;
        }
        decoder->_hex_count = 0;
        decoder->_state = CHUNKED_IN_CHUNK_EXT;
        /* fallthru */
      case CHUNKED_IN_CHUNK_EXT:
        /* RFC 7230 A.2 "Line folding in chunk extensions is disallowed" */
        for (;; ++src) {
          if (src == bufsz)
            goto Exit;
          if (buf[src] == '\012')
            break;
        }
        ++src;
        if (decoder->bytes_left_in_chunk == 0) {
          if (decoder->consume_trailer) {
            decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD;
            break;
          }
          else {
            goto Complete;
          }
        }
        decoder->_state = CHUNKED_IN_CHUNK_DATA;
        /* fallthru */
      case CHUNKED_IN_CHUNK_DATA: {
        size_t avail = bufsz - src;
        if (avail < decoder->bytes_left_in_chunk) {
          if (dst != src)
            memmove(buf + dst, buf + src, avail);
          src += avail;
          dst += avail;
          decoder->bytes_left_in_chunk -= avail;
          goto Exit;
        }
        if (dst != src)
          memmove(buf + dst, buf + src, decoder->bytes_left_in_chunk);
        src += decoder->bytes_left_in_chunk;
        dst += decoder->bytes_left_in_chunk;
        decoder->bytes_left_in_chunk = 0;
        decoder->_state = CHUNKED_IN_CHUNK_CRLF;
      }
        /* fallthru */
      case CHUNKED_IN_CHUNK_CRLF:
        for (;; ++src) {
          if (src == bufsz)
            goto Exit;
          if (buf[src] != '\015')
            break;
        }
        if (buf[src] != '\012') {
          ret = -1;
          goto Exit;
        }
        ++src;
        decoder->_state = CHUNKED_IN_CHUNK_SIZE;
        break;
      case CHUNKED_IN_TRAILERS_LINE_HEAD:
        for (;; ++src) {
          if (src == bufsz)
            goto Exit;
          if (buf[src] != '\015')
            break;
        }
        if (buf[src++] == '\012')
          goto Complete;
        decoder->_state = CHUNKED_IN_TRAILERS_LINE_MIDDLE;
        /* fallthru */
      case CHUNKED_IN_TRAILERS_LINE_MIDDLE:
        for (;; ++src) {
          if (src == bufsz)
            goto Exit;
          if (buf[src] == '\012')
            break;
        }
        ++src;
        decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD;
        break;
      default:
        assert(!"decoder is corrupt");
    }
  }

Complete:
  ret = bufsz - src;
Exit:
  if (dst != src)
    memmove(buf + dst, buf + src, bufsz - src);
  *_bufsz = dst;
  return ret;
}

/* Reports whether the decoder's current state is CHUNKED_IN_CHUNK_DATA.
 * Returns 1 if so, 0 otherwise. */
inline int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder) {
  if (decoder->_state == CHUNKED_IN_CHUNK_DATA) {
    return 1;
  }
  return 0;
}
}  // namespace detail
}  // namespace cinatra
/* Undefine the parsing helper macros so they do not remain defined after
 * this header.
 * NOTE(review): ADVANCE_PATH (used by parse_request) is not undefined here -
 * confirm whether that is intentional. */
#undef CHECK_EOF
#undef EXPECT_CHAR
#undef ADVANCE_TOKEN

#endif