feat: indie status page MVP -- FastAPI + SQLite
- 8 DB models (services, incidents, monitors, subscribers, etc.) - Full CRUD API for services, incidents, monitors - Public status page with live data - Incident detail page with timeline - API key authentication - Uptime monitoring scheduler - 13 tests passing - TECHNICAL_DESIGN.md with full spec
This commit is contained in:
commit
902133edd3
4655 changed files with 1342691 additions and 0 deletions
|
|
@ -0,0 +1,76 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if HAVE_AVX
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
|
||||
#else
|
||||
#pragma GCC target("avx")
|
||||
#endif
|
||||
#include <immintrin.h>
|
||||
|
||||
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
|
||||
#ifndef BASE64_AVX_USE_ASM
|
||||
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
|
||||
# define BASE64_AVX_USE_ASM 1
|
||||
# else
|
||||
# define BASE64_AVX_USE_ASM 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "../ssse3/dec_reshuffle.c"
|
||||
#include "../ssse3/dec_loop.c"
|
||||
|
||||
#if BASE64_AVX_USE_ASM
|
||||
# include "./enc_loop_asm.c"
|
||||
#else
|
||||
# include "../ssse3/enc_translate.c"
|
||||
# include "../ssse3/enc_reshuffle.c"
|
||||
# include "../ssse3/enc_loop.c"
|
||||
#endif
|
||||
|
||||
#endif // HAVE_AVX
|
||||
|
||||
// Public AVX entry point for streaming base64 encoding.
// NOTE(review): the generic enc_head.c / enc_tail.c fragments are textually
// included inside the function body; they appear to declare the locals
// (s, slen, o, olen) consumed by the loop call below — confirm against
// those files.
void
base64_stream_encode_avx BASE64_ENC_PARAMS
{
#if HAVE_AVX
	#include "../generic/enc_head.c"

	// For supported compilers, use a hand-optimized inline assembly
	// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
	// AVX flags to generate better optimized AVX code.

	#if BASE64_AVX_USE_ASM
		enc_loop_avx(&s, &slen, &o, &olen);
	#else
		enc_loop_ssse3(&s, &slen, &o, &olen);
	#endif

	#include "../generic/enc_tail.c"
#else
	// AVX support was not compiled in: defer to the stub implementation.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
// Public AVX entry point for streaming base64 decoding. The decode loop is
// the SSSE3 implementation (included above under the AVX target pragma), so
// it is compiled with AVX flags for better code generation.
// NOTE(review): dec_head.c / dec_tail.c are textually included; dec_tail.c
// presumably supplies the return statement for the HAVE_AVX branch — confirm
// against that file.
int
base64_stream_decode_avx BASE64_DEC_PARAMS
{
#if HAVE_AVX
	#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
	// Close the clang `target("avx")` attribute region that was pushed near
	// the top of this file; this appears to be the last function covered by
	// it in this translation unit.
	#if defined(__clang__)
	#pragma clang attribute pop
	#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,264 @@
|
|||
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round.
// NOTE(review): vlddqu performs a full 16-byte load while only 12 bytes are
// consumed per round, which is why the caller reserves slack bytes at the
// end of the input buffer.
#define LOAD(R0, ROUND) \
	"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1, R2) \
	"vpshufb  %[lut0], %["R0"], %["R1"] \n\t" \
	"vpand    %["R1"], %[msk0], %["R2"] \n\t" \
	"vpand    %["R1"], %[msk2], %["R1"] \n\t" \
	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
	"vpmullw  %["R1"], %[msk3], %["R1"] \n\t" \
	"vpor     %["R1"], %["R2"], %["R1"] \n\t"

// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
	"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
	"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
	"vpsubb   %["R2"], %["R0"], %["R0"] \n\t" \
	"vpshufb  %["R0"], %[lut1], %["R2"] \n\t" \
	"vpaddb   %["R1"], %["R2"], %["R0"] \n\t"

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
	"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
	LOAD("a", 0) \
	SHUF("a", "b", "c") \
	TRAN("a", "b", "c") \
	STOR("a", 0) \
	"add $12, %[src] \n\t" \
	"add $16, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
//  + indicates that a register is loaded by that step.
//  | indicates that a register is in use and must not be touched.
//  - indicates that a register is decommissioned by that step.
//  x indicates that a register is used as a temporary by that step.
//  V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() 			/* a b c d e f */ \
	LOAD("a", 0)			/* +           */ \
	SHUF("a", "d", "e")		/* |     + x   */ \
	LOAD("b", 1)			/* | +   |     */ \
	TRAN("a", "d", "e")		/* | |   - x   */ \
	LOAD("c", 2)			/* V V V       */

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.

#define ROUND_3(ROUND, A,B,C,D,E,F) 	/* A B C D E F */ \
	LOAD(D, (ROUND + 3))		/* V V V +     */ \
	SHUF(B, E, F)			/* | | | | + x */ \
	STOR(A, (ROUND + 0))		/* - | | | |   */ \
	TRAN(B, E, F)			/*   | | | - x */ \
	LOAD(E, (ROUND + 4))		/*   | | | +   */ \
	SHUF(C, A, F)			/* + | | | | x */ \
	STOR(B, (ROUND + 1))		/* | - | | |   */ \
	TRAN(C, A, F)			/* -   | | | x */ \
	LOAD(F, (ROUND + 5))		/*     | | | + */ \
	SHUF(D, A, B)			/* + x | | | | */ \
	STOR(C, (ROUND + 2))		/* |   - | | | */ \
	TRAN(D, A, B)			/* - x   V V V */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	SHUF(E, A, B)			/* + x   V V V */ \
	STOR(D, (ROUND + 3))		/* | -   | |   */ \
	TRAN(E, A, B)			/* - x   | |   */ \
	SHUF(F, C, D)			/*     + x | | */ \
	STOR(E, (ROUND + 4))		/*     | - |   */ \
	TRAN(F, C, D)			/*     - x |   */ \
	STOR(F, (ROUND + 5))		/*         -   */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
	ROUND_3_A(ROUND) \
	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
	ROUND_3_B(ROUND) \
	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
|
||||
|
||||
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"

// Hand-written inline-assembly AVX encoder loop. Consumes 12 input bytes and
// produces 16 base64 characters per round; work is dispatched through a 36x
// unrolled hot loop, then progressively smaller unrolled blocks (18x, 9x,
// 6x), and finally individual rounds 0..5.
static inline void
enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// For a clearer explanation of the algorithm used by this function,
	// please refer to the plain (not inline assembly) implementation. This
	// function follows the same basic logic.

	if (*slen < 16) {
		return;
	}

	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
	// bytes, so "reserve" four bytes from the input buffer to ensure that
	// we never read beyond the end of the input buffer.
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;	// 12 bytes consumed per round
	*olen += rounds * 16;	// 16 bytes produced per round

	// Number of times to go through the 36x loop.
	size_t loops = rounds / 36;

	// Number of rounds remaining after the 36x loop.
	rounds %= 36;

	// Lookup tables.
	const __m128i lut0 = _mm_set_epi8(
		10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);

	const __m128i lut1 = _mm_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Temporary registers.
	__m128i a, b, c, d, e, f;

	// The numeric labels below are GNU as "local labels"; the f/b suffix on
	// jumps selects the nearest matching label forward/backward.
	__asm__ volatile (

		// If there are 36 rounds or more, enter a 36x unrolled loop of
		// interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations (table lookups,
		// etc) to maximize pipeline throughput.
		"    test %[loops], %[loops] \n\t"
		"    jz   18f                \n\t"
		"    jmp  36f                \n\t"
		"                            \n\t"
		".balign 64                  \n\t"
		"36: " ROUND_3_INIT()
		"    " ROUND_3_A( 0)
		"    " ROUND_3_B( 3)
		"    " ROUND_3_A( 6)
		"    " ROUND_3_B( 9)
		"    " ROUND_3_A(12)
		"    " ROUND_3_B(15)
		"    " ROUND_3_A(18)
		"    " ROUND_3_B(21)
		"    " ROUND_3_A(24)
		"    " ROUND_3_B(27)
		"    " ROUND_3_A_LAST(30)
		"    add $(12 * 36), %[src]  \n\t"
		"    add $(16 * 36), %[dst]  \n\t"
		"    dec %[loops]            \n\t"
		"    jnz 36b                 \n\t"

		// Enter an 18x unrolled loop for rounds of 18 or more.
		"18: cmp $18, %[rounds]      \n\t"
		"    jl  9f                  \n\t"
		"    " ROUND_3_INIT()
		"    " ROUND_3_A(0)
		"    " ROUND_3_B(3)
		"    " ROUND_3_A(6)
		"    " ROUND_3_B(9)
		"    " ROUND_3_A_LAST(12)
		"    sub $18, %[rounds]      \n\t"
		"    add $(12 * 18), %[src]  \n\t"
		"    add $(16 * 18), %[dst]  \n\t"

		// Enter a 9x unrolled loop for rounds of 9 or more.
		"9:  cmp $9, %[rounds]       \n\t"
		"    jl  6f                  \n\t"
		"    " ROUND_3_INIT()
		"    " ROUND_3_A(0)
		"    " ROUND_3_B_LAST(3)
		"    sub $9, %[rounds]       \n\t"
		"    add $(12 * 9), %[src]   \n\t"
		"    add $(16 * 9), %[dst]   \n\t"

		// Enter a 6x unrolled loop for rounds of 6 or more.
		"6:  cmp $6, %[rounds]       \n\t"
		"    jl  55f                 \n\t"
		"    " ROUND_3_INIT()
		"    " ROUND_3_A_LAST(0)
		"    sub $6, %[rounds]       \n\t"
		"    add $(12 * 6), %[src]   \n\t"
		"    add $(16 * 6), %[dst]   \n\t"

		// Dispatch the remaining rounds 0..5.
		"55: cmp $3, %[rounds]       \n\t"
		"    jg  45f                 \n\t"
		"    je  3f                  \n\t"
		"    cmp $1, %[rounds]       \n\t"
		"    jg  2f                  \n\t"
		"    je  1f                  \n\t"
		"    jmp 0f                  \n\t"

		"45: cmp $4, %[rounds]       \n\t"
		"    je  4f                  \n\t"

		// Block of non-interlaced encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		"5: " ROUND()
		"4: " ROUND()
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		: [rounds] "+r"  (rounds),
		  [loops]  "+r"  (loops),
		  [src]    "+r"  (*s),
		  [dst]    "+r"  (*o),
		  [a]      "=&x" (a),
		  [b]      "=&x" (b),
		  [c]      "=&x" (c),
		  [d]      "=&x" (d),
		  [e]      "=&x" (e),
		  [f]      "=&x" (f)

		// Inputs (not modified).
		: [lut0] "x" (lut0),
		  [lut1] "x" (lut1),
		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
		  [n51]  "x" (_mm_set1_epi8(51)),
		  [n25]  "x" (_mm_set1_epi8(25))

		// Clobbers.
		: "cc", "memory"
	);
}

#pragma GCC diagnostic pop
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if HAVE_AVX2
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
|
||||
#else
|
||||
#pragma GCC target("avx2")
|
||||
#endif
|
||||
#include <immintrin.h>
|
||||
|
||||
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
|
||||
#ifndef BASE64_AVX2_USE_ASM
|
||||
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
|
||||
# define BASE64_AVX2_USE_ASM 1
|
||||
# else
|
||||
# define BASE64_AVX2_USE_ASM 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "./dec_reshuffle.c"
|
||||
#include "./dec_loop.c"
|
||||
|
||||
#if BASE64_AVX2_USE_ASM
|
||||
# include "./enc_loop_asm.c"
|
||||
#else
|
||||
# include "./enc_translate.c"
|
||||
# include "./enc_reshuffle.c"
|
||||
# include "./enc_loop.c"
|
||||
#endif
|
||||
|
||||
#endif // HAVE_AVX2
|
||||
|
||||
// Public AVX2 entry point for streaming base64 encoding.
// NOTE(review): the generic enc_head.c / enc_tail.c fragments are textually
// included inside the function body; they appear to declare the locals
// (s, slen, o, olen) consumed by the loop call — confirm against those files.
void
base64_stream_encode_avx2 BASE64_ENC_PARAMS
{
#if HAVE_AVX2
	#include "../generic/enc_head.c"
	enc_loop_avx2(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	// AVX2 support was not compiled in: defer to the stub implementation.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
// Public AVX2 entry point for streaming base64 decoding.
// NOTE(review): dec_head.c / dec_tail.c are textually included; dec_tail.c
// presumably supplies the return statement for the HAVE_AVX2 branch —
// confirm against that file.
int
base64_stream_decode_avx2 BASE64_DEC_PARAMS
{
#if HAVE_AVX2
	#include "../generic/dec_head.c"
	dec_loop_avx2(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
	// Close the clang `target("avx2")` attribute region that was pushed
	// near the top of this file; this appears to be the last function
	// covered by it in this translation unit.
	#if defined(__clang__)
	#pragma clang attribute pop
	#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
// Decode one 32-byte block of base64 text into 24 bytes of binary output.
// Returns 1 on success and advances *s, *o and *rounds; returns 0 without
// consuming any input when the block contains a character outside the
// base64 alphabet, so the caller can fall back to slower handling.
static BASE64_FORCE_INLINE int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Classification table indexed by the low nibble of each input byte:
	const __m256i lut_lo = _mm256_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	// Classification table indexed by the high nibble of each input byte:
	const __m256i lut_hi = _mm256_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

	// Delta values added to each input character to map it to its 6-bit
	// value, selected by high nibble (with a '/' special case below):
	const __m256i lut_roll = _mm256_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0,  0,  0, 0,   0,   0,   0,   0,
		0, 16, 19, 4, -65, -65, -71, -71,
		0,  0,  0, 0,   0,   0,   0,   0);

	const __m256i mask_2F = _mm256_set1_epi8(0x2F);

	// Load input:
	__m256i str = _mm256_loadu_si256((__m256i *) *s);

	// See the SSSE3 decoder for an explanation of the algorithm.
	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

	// A nonzero AND of the two classification vectors flags an invalid
	// base64 character somewhere in the block; bail out unconsumed:
	if (!_mm256_testz_si256(lo, hi)) {
		return 0;
	}

	// '/' (0x2F) needs its own roll entry; the comparison shifts its
	// lut_roll index by the -1 produced by the equality mask:
	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm256_add_epi8(str, roll);

	// Reshuffle the input to packed 12-byte output format:
	str = dec_reshuffle(str);

	// Store the output:
	_mm256_storeu_si256((__m256i *) *o, str);

	// Advance: 32 input bytes consumed, 24 output bytes produced.
	*s += 32;
	*o += 24;
	*rounds -= 1;

	return 1;
}
|
||||
|
||||
// Bulk AVX2 decode loop: repeatedly decodes 32-byte blocks via
// dec_loop_avx2_inner, using unrolled batches of 8/4/2/1 calls. If the inner
// function hits an invalid character it returns 0, the chained &&s
// short-circuit, and the `break` exits with `rounds` still counting the
// blocks that were never decoded; the final adjustment hands those bytes
// back to the caller.
static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 45) {
		return;
	}

	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
	// written after the output, ensure that there will be at least 13
	// bytes of input data left to cover the gap. (11 data bytes and up to
	// two end-of-string markers.)
	size_t rounds = (*slen - 13) / 32;

	*slen -= rounds * 32;	// 32 bytes consumed per round
	*olen += rounds * 24;	// 24 bytes produced per round

	do {
		if (rounds >= 8) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 4) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		if (rounds >= 2) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		dec_loop_avx2_inner(s, o, &rounds);
		break;

	} while (rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 32;
	*olen -= rounds * 24;
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
// Pack 32 decoded 6-bit values (one per input byte) into 24 contiguous
// output bytes, compacting within each 128-bit lane and then merging lanes.
static BASE64_FORCE_INLINE __m256i
dec_reshuffle (const __m256i in)
{
	// in, lower lane, bits, upper case are most significant bits, lower
	// case are least significant bits:
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	// 0000eeee FFffffff 0000DDDD DDddEEEE
	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB

	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
	// 00000000 JJJJJJjj KKKKkkkk LLllllll
	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	// 00000000 DDDDDDdd EEEEeeee FFffffff
	// 00000000 AAAAAAaa BBBBbbbb CCcccccc

	// Pack bytes together in each lane:
	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
	// 00000000 00000000 00000000 00000000
	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa

	// Pack lanes:
	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
}
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
// Special first encoder round. Regular rounds read 32 bytes at an offset of
// -4 from the logical input position; on the very first round that would
// read before the start of the buffer, so load at offset 0 instead and use a
// lane permute to shift the data into the position enc_reshuffle expects.
static BASE64_FORCE_INLINE void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
	// First load is done at s - 0 to not get a segfault:
	__m256i src = _mm256_loadu_si256((__m256i *) *s);

	// Shift by 4 bytes, as required by enc_reshuffle:
	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));

	// Reshuffle, translate, store:
	src = enc_reshuffle(src);
	src = enc_translate(src);
	_mm256_storeu_si256((__m256i *) *o, src);

	// Subsequent loads will be done at s - 4, set pointer for next round:
	// (24 bytes consumed this round, minus the 4-byte offset = +20.)
	*s += 20;
	*o += 32;
}
|
||||
|
||||
static BASE64_FORCE_INLINE void
|
||||
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
|
||||
{
|
||||
// Load input:
|
||||
__m256i src = _mm256_loadu_si256((__m256i *) *s);
|
||||
|
||||
// Reshuffle, translate, store:
|
||||
src = enc_reshuffle(src);
|
||||
src = enc_translate(src);
|
||||
_mm256_storeu_si256((__m256i *) *o, src);
|
||||
|
||||
*s += 24;
|
||||
*o += 32;
|
||||
}
|
||||
|
||||
// Bulk AVX2 encode loop: one special first round (reads at offset 0), then
// remaining rounds in unrolled batches of 8/4/2/1, each reading 32 bytes at
// an effective -4 offset and consuming 24 bytes of input per round.
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 32) {
		return;
	}

	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
	// bytes at a time an offset of -4, ensure that there will be at least
	// 4 remaining bytes after the last round, so that the final read will
	// not pass beyond the bounds of the input buffer:
	size_t rounds = (*slen - 4) / 24;

	*slen -= rounds * 24;	// 24 bytes consumed per round
	*olen += rounds * 32;	// 32 bytes produced per round

	// The first loop iteration requires special handling to ensure that
	// the read, which is done at an offset, does not underflow the buffer:
	enc_loop_avx2_inner_first(s, o);
	rounds--;

	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_avx2_inner(s, o);
		break;
	}

	// Add the offset back: the first round advanced *s by only 20 bytes
	// (24 consumed minus the 4-byte load offset), so restore those 4 here.
	*s += 4;
}
|
||||
|
|
@ -0,0 +1,291 @@
|
|||
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round and a
// constant offset. (The AVX2 encoder normally loads at OFFSET -4; the first
// round is handled specially by the caller to avoid a buffer underrun.)
#define LOAD(R0, ROUND, OFFSET) \
	"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1, R2) \
	"vpshufb  %[lut0], %["R0"], %["R1"] \n\t" \
	"vpand    %["R1"], %[msk0], %["R2"] \n\t" \
	"vpand    %["R1"], %[msk2], %["R1"] \n\t" \
	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
	"vpmullw  %["R1"], %[msk3], %["R1"] \n\t" \
	"vpor     %["R1"], %["R2"], %["R1"] \n\t"

// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
	"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
	"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
	"vpsubb   %["R2"], %["R0"], %["R0"] \n\t" \
	"vpshufb  %["R0"], %[lut1], %["R2"] \n\t" \
	"vpaddb   %["R1"], %["R2"], %["R0"] \n\t"

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
	"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
	LOAD("a", 0, -4) \
	SHUF("a", "b", "c") \
	TRAN("a", "b", "c") \
	STOR("a", 0) \
	"add $24, %[src] \n\t" \
	"add $32, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
//  + indicates that a register is loaded by that step.
//  | indicates that a register is in use and must not be touched.
//  - indicates that a register is decommissioned by that step.
//  x indicates that a register is used as a temporary by that step.
//  V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() 			/* a b c d e f */ \
	LOAD("a", 0, -4)		/* +           */ \
	SHUF("a", "d", "e")		/* |     + x   */ \
	LOAD("b", 1, -4)		/* | +   |     */ \
	TRAN("a", "d", "e")		/* | |   - x   */ \
	LOAD("c", 2, -4)		/* V V V       */

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.

#define ROUND_3(ROUND, A,B,C,D,E,F) 	/* A B C D E F */ \
	LOAD(D, (ROUND + 3), -4)	/* V V V +     */ \
	SHUF(B, E, F)			/* | | | | + x */ \
	STOR(A, (ROUND + 0))		/* - | | | |   */ \
	TRAN(B, E, F)			/*   | | | - x */ \
	LOAD(E, (ROUND + 4), -4)	/*   | | | +   */ \
	SHUF(C, A, F)			/* + | | | | x */ \
	STOR(B, (ROUND + 1))		/* | - | | |   */ \
	TRAN(C, A, F)			/* -   | | | x */ \
	LOAD(F, (ROUND + 5), -4)	/*     | | | + */ \
	SHUF(D, A, B)			/* + x | | | | */ \
	STOR(C, (ROUND + 2))		/* |   - | | | */ \
	TRAN(D, A, B)			/* - x   V V V */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	SHUF(E, A, B)			/* + x   V V V */ \
	STOR(D, (ROUND + 3))		/* | -   | |   */ \
	TRAN(E, A, B)			/* - x   | |   */ \
	SHUF(F, C, D)			/*     + x | | */ \
	STOR(E, (ROUND + 4))		/*     | - |   */ \
	TRAN(F, C, D)			/*     - x |   */ \
	STOR(F, (ROUND + 5))		/*         -   */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
	ROUND_3_A(ROUND) \
	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
	ROUND_3_B(ROUND) \
	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
|
||||
|
||||
// Suppress clang's warning that the literal string in the asm statement is
|
||||
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
|
||||
// compilers). It may be true, but the goal here is not C99 portability.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
|
||||
static inline void
|
||||
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
|
||||
{
|
||||
// For a clearer explanation of the algorithm used by this function,
|
||||
// please refer to the plain (not inline assembly) implementation. This
|
||||
// function follows the same basic logic.
|
||||
|
||||
if (*slen < 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Process blocks of 24 bytes at a time. Because blocks are loaded 32
|
||||
// bytes at a time an offset of -4, ensure that there will be at least
|
||||
// 4 remaining bytes after the last round, so that the final read will
|
||||
// not pass beyond the bounds of the input buffer.
|
||||
size_t rounds = (*slen - 4) / 24;
|
||||
|
||||
*slen -= rounds * 24; // 24 bytes consumed per round
|
||||
*olen += rounds * 32; // 32 bytes produced per round
|
||||
|
||||
// Pre-decrement the number of rounds to get the number of rounds
|
||||
// *after* the first round, which is handled as a special case.
|
||||
rounds--;
|
||||
|
||||
// Number of times to go through the 36x loop.
|
||||
size_t loops = rounds / 36;
|
||||
|
||||
// Number of rounds remaining after the 36x loop.
|
||||
rounds %= 36;
|
||||
|
||||
// Lookup tables.
|
||||
const __m256i lut0 = _mm256_set_epi8(
|
||||
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
|
||||
14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5);
|
||||
|
||||
const __m256i lut1 = _mm256_setr_epi8(
|
||||
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
|
||||
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
|
||||
|
||||
// Temporary registers.
|
||||
__m256i a, b, c, d, e;
|
||||
|
||||
// Temporary register f doubles as the shift mask for the first round.
|
||||
__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
|
||||
|
||||
__asm__ volatile (
|
||||
|
||||
// The first loop iteration requires special handling to ensure
|
||||
// that the read, which is normally done at an offset of -4,
|
||||
// does not underflow the buffer. Load the buffer at an offset
|
||||
// of 0 and permute the input to achieve the same effect.
|
||||
LOAD("a", 0, 0)
|
||||
"vpermd %[a], %[f], %[a] \n\t"
|
||||
|
||||
// Perform the standard shuffling and translation steps.
|
||||
SHUF("a", "b", "c")
|
||||
TRAN("a", "b", "c")
|
||||
|
||||
// Store the result and increment the source and dest pointers.
|
||||
"vmovdqu %[a], (%[dst]) \n\t"
|
||||
"add $24, %[src] \n\t"
|
||||
"add $32, %[dst] \n\t"
|
||||
|
||||
// If there are 36 rounds or more, enter a 36x unrolled loop of
|
||||
// interleaved encoding rounds. The rounds interleave memory
|
||||
// operations (load/store) with data operations (table lookups,
|
||||
// etc) to maximize pipeline throughput.
|
||||
" test %[loops], %[loops] \n\t"
|
||||
" jz 18f \n\t"
|
||||
" jmp 36f \n\t"
|
||||
" \n\t"
|
||||
".balign 64 \n\t"
|
||||
"36: " ROUND_3_INIT()
|
||||
" " ROUND_3_A( 0)
|
||||
" " ROUND_3_B( 3)
|
||||
" " ROUND_3_A( 6)
|
||||
" " ROUND_3_B( 9)
|
||||
" " ROUND_3_A(12)
|
||||
" " ROUND_3_B(15)
|
||||
" " ROUND_3_A(18)
|
||||
" " ROUND_3_B(21)
|
||||
" " ROUND_3_A(24)
|
||||
" " ROUND_3_B(27)
|
||||
" " ROUND_3_A_LAST(30)
|
||||
" add $(24 * 36), %[src] \n\t"
|
||||
" add $(32 * 36), %[dst] \n\t"
|
||||
" dec %[loops] \n\t"
|
||||
" jnz 36b \n\t"
|
||||
|
||||
// Enter an 18x unrolled loop for rounds of 18 or more.
|
||||
"18: cmp $18, %[rounds] \n\t"
|
||||
" jl 9f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A(0)
|
||||
" " ROUND_3_B(3)
|
||||
" " ROUND_3_A(6)
|
||||
" " ROUND_3_B(9)
|
||||
" " ROUND_3_A_LAST(12)
|
||||
" sub $18, %[rounds] \n\t"
|
||||
" add $(24 * 18), %[src] \n\t"
|
||||
" add $(32 * 18), %[dst] \n\t"
|
||||
|
||||
// Enter a 9x unrolled loop for rounds of 9 or more.
|
||||
"9: cmp $9, %[rounds] \n\t"
|
||||
" jl 6f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A(0)
|
||||
" " ROUND_3_B_LAST(3)
|
||||
" sub $9, %[rounds] \n\t"
|
||||
" add $(24 * 9), %[src] \n\t"
|
||||
" add $(32 * 9), %[dst] \n\t"
|
||||
|
||||
// Enter a 6x unrolled loop for rounds of 6 or more.
|
||||
"6: cmp $6, %[rounds] \n\t"
|
||||
" jl 55f \n\t"
|
||||
" " ROUND_3_INIT()
|
||||
" " ROUND_3_A_LAST(0)
|
||||
" sub $6, %[rounds] \n\t"
|
||||
" add $(24 * 6), %[src] \n\t"
|
||||
" add $(32 * 6), %[dst] \n\t"
|
||||
|
||||
// Dispatch the remaining rounds 0..5.
|
||||
"55: cmp $3, %[rounds] \n\t"
|
||||
" jg 45f \n\t"
|
||||
" je 3f \n\t"
|
||||
" cmp $1, %[rounds] \n\t"
|
||||
" jg 2f \n\t"
|
||||
" je 1f \n\t"
|
||||
" jmp 0f \n\t"
|
||||
|
||||
"45: cmp $4, %[rounds] \n\t"
|
||||
" je 4f \n\t"
|
||||
|
||||
// Block of non-interlaced encoding rounds, which can each
|
||||
// individually be jumped to. Rounds fall through to the next.
|
||||
"5: " ROUND()
|
||||
"4: " ROUND()
|
||||
"3: " ROUND()
|
||||
"2: " ROUND()
|
||||
"1: " ROUND()
|
||||
"0: \n\t"
|
||||
|
||||
// Outputs (modified).
|
||||
: [rounds] "+r" (rounds),
|
||||
[loops] "+r" (loops),
|
||||
[src] "+r" (*s),
|
||||
[dst] "+r" (*o),
|
||||
[a] "=&x" (a),
|
||||
[b] "=&x" (b),
|
||||
[c] "=&x" (c),
|
||||
[d] "=&x" (d),
|
||||
[e] "=&x" (e),
|
||||
[f] "+x" (f)
|
||||
|
||||
// Inputs (not modified).
|
||||
: [lut0] "x" (lut0),
|
||||
[lut1] "x" (lut1),
|
||||
[msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
|
||||
[msk1] "x" (_mm256_set1_epi32(0x04000040)),
|
||||
[msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
|
||||
[msk3] "x" (_mm256_set1_epi32(0x01000010)),
|
||||
[n51] "x" (_mm256_set1_epi8(51)),
|
||||
[n25] "x" (_mm256_set1_epi8(25))
|
||||
|
||||
// Clobbers.
|
||||
: "cc", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
|
@ -0,0 +1,83 @@
|
|||
static BASE64_FORCE_INLINE __m256i
enc_reshuffle (const __m256i input)
{
	// Spread each group of three input bytes across four output bytes so
	// that every output byte ends up holding one 6-bit group in the low
	// six bits, ready for translation to the Base64 alphabet.
	//
	// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
	// works with shifted (4 bytes) input in order to be able to work
	// efficiently in the two 128-bit lanes.

	// Input, bytes MSB to LSB:
	// 0 0 0 0 x w v u t s r q p o n m
	// l k j i h g f e d c b a 0 0 0 0

	// Duplicate and reorder bytes within each 128-bit lane; note the
	// repeated indices (e.g. "0, 1" and "1, 2") which place each source
	// byte in two adjacent output positions.
	const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
		10, 11, 9, 10,
		7, 8, 6, 7,
		4, 5, 3, 4,
		1, 2, 0, 1,

		14, 15, 13, 14,
		11, 12, 10, 11,
		8, 9, 7, 8,
		5, 6, 4, 5));
	// in, bytes MSB to LSB:
	// w x v w
	// t u s t
	// q r p q
	// n o m n
	// k l j k
	// h i g h
	// e f d e
	// b c a b

	// Mask out the two 6-bit groups that the multiply-high step below
	// will move into place.
	const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
	// bits, upper case are most significant bits, lower case are least
	// significant bits.
	// 0000wwww XX000000 VVVVVV00 00000000
	// 0000tttt UU000000 SSSSSS00 00000000
	// 0000qqqq RR000000 PPPPPP00 00000000
	// 0000nnnn OO000000 MMMMMM00 00000000
	// 0000kkkk LL000000 JJJJJJ00 00000000
	// 0000hhhh II000000 GGGGGG00 00000000
	// 0000eeee FF000000 DDDDDD00 00000000
	// 0000bbbb CC000000 AAAAAA00 00000000

	// Multiply-high shifts each 16-bit element right by a per-element
	// amount (the multiplier encodes the two shift counts).
	const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
	// 00000000 00wwwwXX 00000000 00VVVVVV
	// 00000000 00ttttUU 00000000 00SSSSSS
	// 00000000 00qqqqRR 00000000 00PPPPPP
	// 00000000 00nnnnOO 00000000 00MMMMMM
	// 00000000 00kkkkLL 00000000 00JJJJJJ
	// 00000000 00hhhhII 00000000 00GGGGGG
	// 00000000 00eeeeFF 00000000 00DDDDDD
	// 00000000 00bbbbCC 00000000 00AAAAAA

	// Mask out the other two 6-bit groups, to be shifted left below.
	const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
	// 00000000 00xxxxxx 000000vv WWWW0000
	// 00000000 00uuuuuu 000000ss TTTT0000
	// 00000000 00rrrrrr 000000pp QQQQ0000
	// 00000000 00oooooo 000000mm NNNN0000
	// 00000000 00llllll 000000jj KKKK0000
	// 00000000 00iiiiii 000000gg HHHH0000
	// 00000000 00ffffff 000000dd EEEE0000
	// 00000000 00cccccc 000000aa BBBB0000

	// Multiply-low shifts each 16-bit element left by a per-element
	// amount encoded in the multiplier.
	const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
	// 00xxxxxx 00000000 00vvWWWW 00000000
	// 00uuuuuu 00000000 00ssTTTT 00000000
	// 00rrrrrr 00000000 00ppQQQQ 00000000
	// 00oooooo 00000000 00mmNNNN 00000000
	// 00llllll 00000000 00jjKKKK 00000000
	// 00iiiiii 00000000 00ggHHHH 00000000
	// 00ffffff 00000000 00ddEEEE 00000000
	// 00cccccc 00000000 00aaBBBB 00000000

	// Merge the two halves: every byte now holds one 6-bit value.
	return _mm256_or_si256(t1, t3);
	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
static BASE64_FORCE_INLINE __m256i
enc_translate (const __m256i in)
{
	// Translate 32 reshuffled 6-bit values (one per byte, range 0..63)
	// into their Base64 ASCII characters by adding a per-range offset.

	// A lookup table containing the absolute offsets for all ranges:
	const __m256i lut = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To        Abs   Index  Characters
	// 0  [0..25]   [65..90]  +65   0      ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122] +71   1      abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]   -4   [2..11] 0123456789
	// 3  [62]      [43]      -19   12     +
	// 4  [63]      [47]      -16   13     /

	// Create LUT indices from the input. The index for range #0 is right,
	// others are 1 less than expected:
	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
	// now correct:
	indices = _mm256_sub_epi8(indices, mask);

	// Add offsets to input values:
	return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
}
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if HAVE_AVX512
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push (__attribute__((target("avx512vbmi"))), apply_to=function)
|
||||
#else
|
||||
#pragma GCC target("avx512vbmi")
|
||||
#endif
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "../avx2/dec_reshuffle.c"
|
||||
#include "../avx2/dec_loop.c"
|
||||
#include "enc_reshuffle_translate.c"
|
||||
#include "enc_loop.c"
|
||||
|
||||
#endif // HAVE_AVX512
|
||||
|
||||
void
base64_stream_encode_avx512 BASE64_ENC_PARAMS
{
#if HAVE_AVX512
	// Shared prologue: unpacks the codec state and src/dst pointers into
	// the locals (s, slen, o, olen) used by the loop below.
	#include "../generic/enc_head.c"
	enc_loop_avx512(&s, &slen, &o, &olen);
	// Shared epilogue: encodes any remaining tail bytes one at a time and
	// writes the updated state back to the caller.
	#include "../generic/enc_tail.c"
#else
	// Built without AVX512 support; fall back to the stub.
	// NOTE(review): base64_enc_stub is declared elsewhere (codecs.h,
	// presumably) -- verify its contract against BASE64_ENC_PARAMS.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
// Reuse AVX2 decoding. Not supporting AVX512 at present
int
base64_stream_decode_avx512 BASE64_DEC_PARAMS
{
#if HAVE_AVX512
	// Shared prologue: unpacks the codec state into locals.
	#include "../generic/dec_head.c"
	dec_loop_avx2(&s, &slen, &o, &olen);
	// Shared epilogue: bytewise decode of the tail, error reporting, and
	// state write-back (dec_tail.c ends with a return).
	#include "../generic/dec_tail.c"
	// NOTE(review): the matching `#pragma clang attribute push` sits at
	// file scope near the top of this file; the pop here is inside the
	// last function that needs the target attribute -- confirm clang
	// accepts this placement on all supported versions.
#if defined(__clang__)
#pragma clang attribute pop
#endif
#else
	// Built without AVX512 support; fall back to the stub.
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
static BASE64_FORCE_INLINE void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
	// One encoding round: read 64 bytes (only the first 48 are encoded;
	// the caller guarantees the over-read stays inside the buffer),
	// produce 64 output bytes, and advance both pointers.

	// Load input.
	__m512i src = _mm512_loadu_si512((__m512i *) *s);

	// Reshuffle, translate, store.
	src = enc_reshuffle_translate(src);
	_mm512_storeu_si512((__m512i *) *o, src);

	// 48 input bytes consumed, 64 output bytes produced.
	*s += 48;
	*o += 64;
}
|
||||
|
||||
static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Bulk-encode as many 48-byte blocks as possible with AVX512,
	// updating the pointers and lengths in place; the caller handles
	// whatever tail remains.
	if (*slen < 64) {
		return;
	}

	// Process blocks of 48 bytes at a time. Because blocks are loaded 64
	// bytes at a time, ensure that there will be at least 24 remaining
	// bytes after the last round, so that the final read will not pass
	// beyond the bounds of the input buffer.
	size_t rounds = (*slen - 24) / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	// Manually unrolled dispatch: drain rounds in batches of 8, 4, 2
	// and 1 to amortize loop overhead.
	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_avx512_inner(s, o);
			enc_loop_avx512_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_avx512_inner(s, o);
		break;
	}
}
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.

static BASE64_FORCE_INLINE __m512i
enc_reshuffle_translate (const __m512i input)
{
	// Reshuffle 48 input bytes into 64 6-bit indices and translate them
	// to Base64 ASCII in a single permute, using AVX512-VBMI byte
	// permutes and the multishift instruction.

	// 32-bit input
	// [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
	//  b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
	// output order  [1, 2, 0, 1]
	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]

	// Byte-index pattern: each 32-bit lane gathers three consecutive
	// source bytes in order [1, 2, 0, 1].
	const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
	                                                0x04050304,
	                                                0x07080607,
	                                                0x0a0b090a,
	                                                0x0d0e0c0d,
	                                                0x10110f10,
	                                                0x13141213,
	                                                0x16171516,
	                                                0x191a1819,
	                                                0x1c1d1b1c,
	                                                0x1f201e1f,
	                                                0x22232122,
	                                                0x25262425,
	                                                0x28292728,
	                                                0x2b2c2a2b,
	                                                0x2e2f2d2e);

	// Reorder bytes
	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
	const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);

	// After multishift a single 32-bit lane has following layout
	// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
	//  a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
	// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])

	// Per-byte bit offsets: 48, 54, 36, 42, 16, 22, 4, 10
	const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
	__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);

	// Translate immediately after reshuffled.
	const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);

	// Translation 6-bit values to ASCII.
	return _mm512_permutexvar_epi8(shuffled_in, lookup);
}
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
static BASE64_FORCE_INLINE int
dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Decode one group of four Base64 characters into three output bytes
	// using four precomputed 32-bit lookup tables. Returns 1 on success,
	// 0 if any input character is invalid (pointers left untouched).
	const uint32_t str
		= base64_table_dec_32bit_d0[(*s)[0]]
		| base64_table_dec_32bit_d1[(*s)[1]]
		| base64_table_dec_32bit_d2[(*s)[2]]
		| base64_table_dec_32bit_d3[(*s)[3]];

#if BASE64_LITTLE_ENDIAN

	// LUTs for little-endian set MSB in case of invalid character:
	if (str & UINT32_C(0x80000000)) {
		return 0;
	}
#else
	// LUTs for big-endian set LSB in case of invalid character:
	if (str & UINT32_C(1)) {
		return 0;
	}
#endif
	// Store the output. Note this writes all four bytes of `str` (three
	// decoded bytes plus one zero byte); the caller reserves input so
	// the extra byte never overruns the output buffer:
	memcpy(*o, &str, sizeof (str));

	*s += 4;
	*o += 3;
	*rounds -= 1;

	return 1;
}
|
||||
|
||||
static inline void
dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Bulk-decode groups of four Base64 characters into three bytes
	// each, stopping early on the first invalid character so that the
	// caller's bytewise tail code can do error checking and reporting.
	if (*slen < 8) {
		return;
	}

	// Process blocks of 4 bytes per round. Because one extra zero byte is
	// written after the output, ensure that there will be at least 4 bytes
	// of input data left to cover the gap. (Two data bytes and up to two
	// end-of-string markers.)
	size_t rounds = (*slen - 4) / 4;

	*slen -= rounds * 4;	// 4 bytes consumed per round
	*olen += rounds * 3;	// 3 bytes produced per round

	// Drain the rounds in unrolled batches of 8, then 4, then 2, then 1.
	// The inner function decrements `rounds` itself on every successful
	// round and returns 0 on invalid input, in which case we stop.
	int ok = 1;

	while (ok && rounds >= 8) {
		for (int i = 0; ok && i < 8; i++) {
			ok = dec_loop_generic_32_inner(s, o, &rounds);
		}
	}
	if (ok && rounds >= 4) {
		for (int i = 0; ok && i < 4; i++) {
			ok = dec_loop_generic_32_inner(s, o, &rounds);
		}
	}
	if (ok && rounds >= 2) {
		ok = dec_loop_generic_32_inner(s, o, &rounds)
		  && dec_loop_generic_32_inner(s, o, &rounds);
	}
	if (ok && rounds == 1) {
		dec_loop_generic_32_inner(s, o, &rounds);
	}

	// Adjust for any rounds that were skipped:
	*slen += rounds * 4;
	*olen -= rounds * 3;
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
static BASE64_FORCE_INLINE void
|
||||
enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
|
||||
{
|
||||
uint32_t src;
|
||||
|
||||
// Load input:
|
||||
memcpy(&src, *s, sizeof (src));
|
||||
|
||||
// Reorder to 32-bit big-endian, if not already in that format. The
|
||||
// workset must be in big-endian, otherwise the shifted bits do not
|
||||
// carry over properly among adjacent bytes:
|
||||
src = BASE64_HTOBE32(src);
|
||||
|
||||
// Two indices for the 12-bit lookup table:
|
||||
const size_t index0 = (src >> 20) & 0xFFFU;
|
||||
const size_t index1 = (src >> 8) & 0xFFFU;
|
||||
|
||||
// Table lookup and store:
|
||||
memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
|
||||
memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
|
||||
|
||||
*s += 3;
|
||||
*o += 4;
|
||||
}
|
||||
|
||||
static inline void
enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Bulk-encode 3-byte input blocks into 4-byte Base64 output using
	// the 32-bit scalar inner round; the caller encodes the tail.
	if (*slen < 4) {
		return;
	}

	// Process blocks of 3 bytes at a time. Because blocks are loaded 4
	// bytes at a time, ensure that there will be at least one remaining
	// byte after the last round, so that the final read will not pass
	// beyond the bounds of the input buffer:
	size_t rounds = (*slen - 1) / 3;

	*slen -= rounds * 3;	// 3 bytes consumed per round
	*olen += rounds * 4;	// 4 bytes produced per round

	// Unrolled dispatch: batches of 8, then 4, then 2, then a final
	// single round. The inner round never fails, so this is purely a
	// loop-overhead optimization.
	while (rounds >= 8) {
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		rounds -= 8;
	}
	if (rounds >= 4) {
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		rounds -= 4;
	}
	if (rounds >= 2) {
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		rounds -= 2;
	}
	if (rounds == 1) {
		enc_loop_generic_32_inner(s, o);
	}
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
static BASE64_FORCE_INLINE void
enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
{
	// Encode one group of six input bytes into eight Base64 characters
	// via four 12-bit table lookups, advancing both pointers. Reads
	// eight bytes (two more than consumed); the caller guarantees the
	// over-read stays inside the buffer.
	uint64_t src;

	// Load input:
	memcpy(&src, *s, sizeof (src));

	// Reorder to 64-bit big-endian, if not already in that format. The
	// workset must be in big-endian, otherwise the shifted bits do not
	// carry over properly among adjacent bytes:
	src = BASE64_HTOBE64(src);

	// Four indices for the 12-bit lookup table:
	const size_t index0 = (src >> 52) & 0xFFFU;
	const size_t index1 = (src >> 40) & 0xFFFU;
	const size_t index2 = (src >> 28) & 0xFFFU;
	const size_t index3 = (src >> 16) & 0xFFFU;

	// Table lookup and store (each lookup yields two ASCII chars):
	memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
	memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
	memcpy(*o + 4, base64_table_enc_12bit + index2, 2);
	memcpy(*o + 6, base64_table_enc_12bit + index3, 2);

	*s += 6;
	*o += 8;
}
|
||||
|
||||
static inline void
enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Bulk-encode 6-byte input blocks into 8-byte Base64 output using
	// the 64-bit scalar inner round; the caller encodes the tail.
	if (*slen < 8) {
		return;
	}

	// Process blocks of 6 bytes at a time. Because blocks are loaded 8
	// bytes at a time, ensure that there will be at least 2 remaining
	// bytes after the last round, so that the final read will not pass
	// beyond the bounds of the input buffer:
	size_t rounds = (*slen - 2) / 6;

	*slen -= rounds * 6;	// 6 bytes consumed per round
	*olen += rounds * 8;	// 8 bytes produced per round

	// Unrolled dispatch: batches of 8, then 4, then 2, then a final
	// single round. The inner round never fails, so this is purely a
	// loop-overhead optimization.
	while (rounds >= 8) {
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		rounds -= 8;
	}
	if (rounds >= 4) {
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		rounds -= 4;
	}
	if (rounds >= 2) {
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		rounds -= 2;
	}
	if (rounds == 1) {
		enc_loop_generic_64_inner(s, o);
	}
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if BASE64_WORDSIZE == 32
|
||||
# include "32/enc_loop.c"
|
||||
#elif BASE64_WORDSIZE == 64
|
||||
# include "64/enc_loop.c"
|
||||
#endif
|
||||
|
||||
#if BASE64_WORDSIZE >= 32
|
||||
# include "32/dec_loop.c"
|
||||
#endif
|
||||
|
||||
void
base64_stream_encode_plain BASE64_ENC_PARAMS
{
	// Plain (non-SIMD) streaming encoder: shared prologue, then the
	// word-size-appropriate scalar bulk loop, then the shared bytewise
	// tail that also writes state back.
#include "enc_head.c"
#if BASE64_WORDSIZE == 32
	enc_loop_generic_32(&s, &slen, &o, &olen);
#elif BASE64_WORDSIZE == 64
	enc_loop_generic_64(&s, &slen, &o, &olen);
#endif
#include "enc_tail.c"
}
|
||||
|
||||
int
base64_stream_decode_plain BASE64_DEC_PARAMS
{
	// Plain (non-SIMD) streaming decoder: shared prologue, scalar bulk
	// loop where the word size permits, then the shared bytewise tail
	// (dec_tail.c ends with the function's return statement).
#include "dec_head.c"
#if BASE64_WORDSIZE >= 32
	dec_loop_generic_32(&s, &slen, &o, &olen);
#endif
#include "dec_tail.c"
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
// NOTE(review): this is an #include fragment (generic/dec_head.c). It opens
// a switch-over-for Duff's device that the matching dec_tail.c fragment
// closes; it is only valid when included inside a BASE64_DEC_PARAMS body.
int ret = 0;
const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;
uint8_t q;

// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.eof = state->eof;
st.bytes = state->bytes;
st.carry = state->carry;

// If we previously saw an EOF or an invalid character, bail out:
if (st.eof) {
	*outlen = 0;
	ret = 0;
	// If there was a trailing '=' to check, check it:
	if (slen && (st.eof == BASE64_AEOF)) {
		state->bytes = 0;
		state->eof = BASE64_EOF;
		// 254 is the decode-table code for '=':
		ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
	}
	return ret;
}

// Turn four 6-bit numbers into three bytes:
// out[0] = 11111122
// out[1] = 22223333
// out[2] = 33444444

// Duff's device again:
switch (st.bytes)
{
	for (;;)
	{
	case 0:
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
		// NOTE(review): #include fragment (generic/dec_tail.c); continues
		// the Duff's device switch opened by dec_head.c, starting inside
		// `case 0`. Table codes: 254 = '=', 255 = invalid character.
		if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.eof = BASE64_EOF;
			// Treat character '=' as invalid for byte 0:
			break;
		}
		st.carry = q << 2;
		st.bytes++;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 1:	if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.eof = BASE64_EOF;
			// Treat character '=' as invalid for byte 1:
			break;
		}
		*o++ = st.carry | (q >> 4);
		st.carry = q << 4;
		st.bytes++;
		olen++;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 2:	if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.bytes++;
			// When q == 254, the input char is '='.
			// Check if next byte is also '=':
			if (q == 254) {
				if (slen-- != 0) {
					st.bytes = 0;
					// EOF:
					st.eof = BASE64_EOF;
					q = base64_table_dec_8bit[*s++];
					// Valid only if the next char is also '='
					// and it is the last input character:
					ret = ((q == 254) && (slen == 0)) ? 1 : 0;
					break;
				}
				else {
					// Almost EOF
					st.eof = BASE64_AEOF;
					ret = 1;
					break;
				}
			}
			// If we get here, there was an error:
			break;
		}
		*o++ = st.carry | (q >> 2);
		st.carry = q << 6;
		st.bytes++;
		olen++;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 3:	if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.bytes = 0;
			st.eof = BASE64_EOF;
			// When q == 254, the input char is '='. Return 1 and EOF.
			// When q == 255, the input char is invalid. Return 0 and EOF.
			ret = ((q == 254) && (slen == 0)) ? 1 : 0;
			break;
		}
		*o++ = st.carry | q;
		st.carry = 0;
		st.bytes = 0;
		olen++;
	}
}

// Write the local working state back to the caller:
state->eof = st.eof;
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;
return ret;
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
// NOTE(review): this is an #include fragment (generic/enc_head.c). It opens
// a switch-over-for Duff's device that the matching enc_tail.c fragment
// closes; it is only valid when included inside a BASE64_ENC_PARAMS body.

// Assume that *out is large enough to contain the output.
// Theoretically it should be 4/3 the length of src.
const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;

// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.bytes = state->bytes;
st.carry = state->carry;

// Turn three bytes into four 6-bit numbers:
// in[0] = 00111111
// in[1] = 00112222
// in[2] = 00222233
// in[3] = 00333333

// Duff's device, a for() loop inside a switch() statement. Legal!
switch (st.bytes)
{
	for (;;)
	{
	case 0:
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
		// NOTE(review): #include fragment (generic/enc_tail.c); continues
		// the Duff's device switch opened by enc_head.c, starting inside
		// `case 0`. Encodes remaining input bytewise, carrying partial
		// 6-bit groups across calls in st.carry.
		if (slen-- == 0) {
			break;
		}
		*o++ = base64_table_enc_6bit[*s >> 2];
		// Keep the low 2 bits, pre-shifted into carry position:
		st.carry = (*s++ << 4) & 0x30;
		st.bytes++;
		olen += 1;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 1:	if (slen-- == 0) {
			break;
		}
		*o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
		// Keep the low 4 bits, pre-shifted into carry position:
		st.carry = (*s++ << 2) & 0x3C;
		st.bytes++;
		olen += 1;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 2:	if (slen-- == 0) {
			break;
		}
		*o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
		*o++ = base64_table_enc_6bit[*s++ & 0x3F];
		st.bytes = 0;
		olen += 2;
	}
}
// Write the local working state back to the caller:
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#ifdef __arm__
|
||||
# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
|
||||
# define BASE64_USE_NEON32
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifdef BASE64_USE_NEON32
|
||||
#include <arm_neon.h>
|
||||
|
||||
// Only enable inline assembly on supported compilers.
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
#define BASE64_NEON32_USE_ASM
|
||||
#endif
|
||||
|
||||
static BASE64_FORCE_INLINE uint8x16_t
|
||||
vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
|
||||
{
|
||||
// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
|
||||
// the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
|
||||
uint8x8x2_t lut2;
|
||||
uint8x8x2_t result;
|
||||
|
||||
lut2.val[0] = vget_low_u8(lut);
|
||||
lut2.val[1] = vget_high_u8(lut);
|
||||
|
||||
result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
|
||||
result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));
|
||||
|
||||
return vcombine_u8(result.val[0], result.val[1]);
|
||||
}
|
||||
|
||||
#include "../generic/32/dec_loop.c"
|
||||
#include "../generic/32/enc_loop.c"
|
||||
#include "dec_loop.c"
|
||||
#include "enc_reshuffle.c"
|
||||
#include "enc_translate.c"
|
||||
#include "enc_loop.c"
|
||||
|
||||
#endif // BASE64_USE_NEON32
|
||||
|
||||
// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 32 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.

void
base64_stream_encode_neon32 BASE64_ENC_PARAMS
{
#ifdef BASE64_USE_NEON32
	// Shared prologue, NEON bulk loop, then the scalar 32-bit loop to
	// shrink the tail before the bytewise epilogue.
	#include "../generic/enc_head.c"
	enc_loop_neon32(&s, &slen, &o, &olen);
	enc_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	// Built without NEON32 support; fall back to the stub.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
int
base64_stream_decode_neon32 BASE64_DEC_PARAMS
{
#ifdef BASE64_USE_NEON32
	// Shared prologue, NEON bulk loop, then the scalar 32-bit loop to
	// shrink the tail before the bytewise epilogue (which returns).
	#include "../generic/dec_head.c"
	dec_loop_neon32(&s, &slen, &o, &olen);
	dec_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
#else
	// Built without NEON32 support; fall back to the stub.
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
static BASE64_FORCE_INLINE int
is_nonzero (const uint8x16_t v)
{
	// Return nonzero if any byte of `v` is nonzero. Saturating-narrow
	// the vector from 2x64 to 2x32 bits, then test the result as a
	// single 64-bit scalar.
	uint64_t u64;
	const uint64x2_t v64 = vreinterpretq_u64_u8(v);
	const uint32x2_t v32 = vqmovn_u64(v64);

	vst1_u64(&u64, vreinterpret_u64_u32(v32));
	return u64 != 0;
}
|
||||
|
||||
static BASE64_FORCE_INLINE uint8x16_t
delta_lookup (const uint8x16_t v)
{
	// Map the per-byte class indices produced by dec_loop_neon32_lane to
	// the ASCII deltas that turn a Base64 character into its 6-bit
	// value. Offsets are stored as wrapped unsigned bytes (-65 -> 191,
	// -71 -> 185).
	const uint8x8_t lut = {
		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
	};

	// 64-bit-wide lookups over an 8-entry table, low and high halves:
	return vcombine_u8(
		vtbl1_u8(lut, vget_low_u8(v)),
		vtbl1_u8(lut, vget_high_u8(v)));
}
|
||||
|
||||
static BASE64_FORCE_INLINE uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
	// Decode 16 Base64 characters in-place to their 6-bit values and
	// return a validity mask (a zero byte marks an invalid character).
	// See the SSSE3 decoder for an explanation of the algorithm.
	const uint8x16_t lut_lo = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
	};

	const uint8x16_t lut_hi = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
	};

	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);

	// Classify each byte by its high nibble, low nibble, and whether it
	// is '/' (0x2F), which needs special-casing:
	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
	const uint8x16_t eq_2F      = vceqq_u8(*lane, mask_2F);

	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

	// Now simply add the delta values to the input:
	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));

	// Return the validity mask:
	return vandq_u8(lo, hi);
}
|
||||
|
||||
static inline void
dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Bulk-decode 64 Base64 characters to 48 bytes per round with NEON,
	// bailing out to the caller's bytewise code on invalid input.
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	do {
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
		uint8x16x4_t str = vld4q_u8(*s);

		// Decode each lane, collect a mask of invalid inputs:
		const uint8x16_t classified
			= dec_loop_neon32_lane(&str.val[0])
			| dec_loop_neon32_lane(&str.val[1])
			| dec_loop_neon32_lane(&str.val[2])
			| dec_loop_neon32_lane(&str.val[3]);

		// Check for invalid input: if any of the delta values are
		// zero, fall back on bytewise code to do error checking and
		// reporting:
		if (is_nonzero(classified)) {
			break;
		}

		// Compress four bytes into three:
		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

		// Interleave and store decoded result:
		vst3q_u8(*o, dec);

		*s += 64;
		*o += 48;

	} while (--rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
#ifdef BASE64_NEON32_USE_ASM
|
||||
static BASE64_FORCE_INLINE void
|
||||
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
|
||||
{
|
||||
// This function duplicates the functionality of enc_loop_neon32_inner,
|
||||
// but entirely with inline assembly. This gives a significant speedup
|
||||
// over using NEON intrinsics, which do not always generate very good
|
||||
// code. The logic of the assembly is directly lifted from the
|
||||
// intrinsics version, so it can be used as a guide to this code.
|
||||
|
||||
// Temporary registers, used as scratch space.
|
||||
uint8x16_t tmp0, tmp1, tmp2, tmp3;
|
||||
uint8x16_t mask0, mask1, mask2, mask3;
|
||||
|
||||
// A lookup table containing the absolute offsets for all ranges.
|
||||
const uint8x16_t lut = {
|
||||
65U, 71U, 252U, 252U,
|
||||
252U, 252U, 252U, 252U,
|
||||
252U, 252U, 252U, 252U,
|
||||
237U, 240U, 0U, 0U
|
||||
};
|
||||
|
||||
// Numeric constants.
|
||||
const uint8x16_t n51 = vdupq_n_u8(51);
|
||||
const uint8x16_t n25 = vdupq_n_u8(25);
|
||||
const uint8x16_t n63 = vdupq_n_u8(63);
|
||||
|
||||
__asm__ (
|
||||
|
||||
// Load 48 bytes and deinterleave. The bytes are loaded to
|
||||
// hard-coded registers q12, q13 and q14, to ensure that they
|
||||
// are contiguous. Increment the source pointer.
|
||||
"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
|
||||
"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
|
||||
|
||||
// Reshuffle the bytes using temporaries.
|
||||
"vshr.u8 %q[t0], q12, #2 \n\t"
|
||||
"vshr.u8 %q[t1], q13, #4 \n\t"
|
||||
"vshr.u8 %q[t2], q14, #6 \n\t"
|
||||
"vsli.8 %q[t1], q12, #4 \n\t"
|
||||
"vsli.8 %q[t2], q13, #2 \n\t"
|
||||
"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
|
||||
"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
|
||||
"vand.u8 %q[t3], q14, %q[n63] \n\t"
|
||||
|
||||
// t0..t3 are the reshuffled inputs. Create LUT indices.
|
||||
"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
|
||||
"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
|
||||
"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
|
||||
"vqsub.u8 q15, %q[t3], %q[n51] \n\t"
|
||||
|
||||
// Create the mask for range #0.
|
||||
"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
|
||||
"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
|
||||
"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
|
||||
"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
|
||||
|
||||
// Subtract -1 to correct the LUT indices.
|
||||
"vsub.u8 q12, %q[m0] \n\t"
|
||||
"vsub.u8 q13, %q[m1] \n\t"
|
||||
"vsub.u8 q14, %q[m2] \n\t"
|
||||
"vsub.u8 q15, %q[m3] \n\t"
|
||||
|
||||
// Lookup the delta values.
|
||||
"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
|
||||
"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
|
||||
"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
|
||||
"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
|
||||
"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
|
||||
"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
|
||||
"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
|
||||
"vtbl.u8 d31, {%q[lut]}, d31 \n\t"
|
||||
|
||||
// Add the delta values.
|
||||
"vadd.u8 q12, %q[t0] \n\t"
|
||||
"vadd.u8 q13, %q[t1] \n\t"
|
||||
"vadd.u8 q14, %q[t2] \n\t"
|
||||
"vadd.u8 q15, %q[t3] \n\t"
|
||||
|
||||
// Store 64 bytes and interleave. Increment the dest pointer.
|
||||
"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
|
||||
"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"
|
||||
|
||||
// Outputs (modified).
|
||||
: [src] "+r" (*s),
|
||||
[dst] "+r" (*o),
|
||||
[t0] "=&w" (tmp0),
|
||||
[t1] "=&w" (tmp1),
|
||||
[t2] "=&w" (tmp2),
|
||||
[t3] "=&w" (tmp3),
|
||||
[m0] "=&w" (mask0),
|
||||
[m1] "=&w" (mask1),
|
||||
[m2] "=&w" (mask2),
|
||||
[m3] "=&w" (mask3)
|
||||
|
||||
// Inputs (not modified).
|
||||
: [lut] "w" (lut),
|
||||
[n25] "w" (n25),
|
||||
[n51] "w" (n51),
|
||||
[n63] "w" (n63)
|
||||
|
||||
// Clobbers.
|
||||
: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
|
||||
"cc", "memory"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Encode one 48-byte block into 64 Base64 characters and advance both
// pointers. Dispatches to the inline-assembly implementation when it is
// available, otherwise uses the NEON intrinsics path below.
static BASE64_FORCE_INLINE void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
	enc_loop_neon32_inner_asm(s, o);
#else
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Reshuffle: divide the bits of three input bytes over four output
	// bytes (see enc_reshuffle):
	uint8x16x4_t out = enc_reshuffle(src);

	// Translate reshuffled bytes to the Base64 alphabet:
	out = enc_translate(out);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
#endif
}
|
||||
|
||||
// Main NEON32 encoding loop: converts the input in 48-byte strides into
// 64-byte blocks of Base64 output. The byte accounting is settled up
// front; the calls to the inner routine are kept straight-line in batches
// of 8, 4, 2 and 1 so the compiler is free to schedule across them.
static inline void
enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	// Drain full batches of eight rounds first:
	while (rounds >= 8) {
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		rounds -= 8;
	}

	// At most seven rounds remain; peel them off in powers of two:
	if (rounds >= 4) {
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		rounds -= 4;
	}
	if (rounds >= 2) {
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		rounds -= 2;
	}
	if (rounds == 1) {
		enc_loop_neon32_inner(s, o);
	}
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
// Redistribute the bits of three input bytes over four output bytes, so
// that each output byte carries one 6-bit Base64 value in its low bits.
static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Input:
	// in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
	// in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
	// in[2]  = c7 c6 c5 c4 c3 c2 c1 c0

	// Output:
	// out[0] = 00 00 a7 a6 a5 a4 a3 a2
	// out[1] = 00 00 a1 a0 b7 b6 b5 b4
	// out[2] = 00 00 b3 b2 b1 b0 c7 c6
	// out[3] = 00 00 c5 c4 c3 c2 c1 c0

	// Move the input bits to where they need to be in the outputs. Except
	// for the first output, the high two bits are not cleared. The VSLI
	// (shift-left-and-insert) merges the shifted bits into the vector
	// produced by the preceding shift-right.
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vshrq_n_u8(in.val[1], 4);
	out.val[2] = vshrq_n_u8(in.val[2], 6);
	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4);
	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2);

	// Clear the high two bits in the second, third and fourth output.
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(in.val[2], vdupq_n_u8(0x3F));

	return out;
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
// Translate 6-bit values (0..63) in all four lanes to their ASCII Base64
// characters, by adding a per-byte delta looked up in a small table.
static BASE64_FORCE_INLINE uint8x16x4_t
enc_translate (const uint8x16x4_t in)
{
	// A lookup table containing the absolute offsets for all ranges
	// (values above 127 are negative deltas in two's complement:
	// 252 == -4, 237 == -19, 240 == -16):
	const uint8x16_t lut = {
		65U, 71U, 252U, 252U,
		252U, 252U, 252U, 252U,
		252U, 252U, 252U, 252U,
		237U, 240U, 0U, 0U
	};

	const uint8x16_t offset = vdupq_n_u8(51);

	uint8x16x4_t indices, mask, delta, out;

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /

	// Create LUT indices from input: the saturating subtract maps all of
	// range #0 to index 0; the index for range #0 is right, others are 1
	// less than expected:
	indices.val[0] = vqsubq_u8(in.val[0], offset);
	indices.val[1] = vqsubq_u8(in.val[1], offset);
	indices.val[2] = vqsubq_u8(in.val[2], offset);
	indices.val[3] = vqsubq_u8(in.val[3], offset);

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
	mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
	mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
	mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));

	// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
	// now correct:
	indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
	indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
	indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
	indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);

	// Lookup delta values:
	delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
	delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
	delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
	delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);

	// Add delta values:
	out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
	out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
	out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
	out.val[3] = vaddq_u8(in.val[3], delta.val[3]);

	return out;
}
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if HAVE_NEON64
|
||||
#include <arm_neon.h>
|
||||
|
||||
// Only enable inline assembly on supported compilers.
|
||||
#if !defined(__wasm__) && (defined(__GNUC__) || defined(__clang__))
|
||||
#define BASE64_NEON64_USE_ASM
|
||||
#endif
|
||||
|
||||
// Load a 64-byte table at `p' into four 16-byte NEON vectors. On the asm
// path, the four vectors are pinned to the contiguous register block
// v8..v11, which the asm encoding loop relies on for its TBL lookups.
static BASE64_FORCE_INLINE uint8x16x4_t
load_64byte_table (const uint8_t *p)
{
#ifdef BASE64_NEON64_USE_ASM

	// Force the table to be loaded into contiguous registers. GCC will not
	// normally allocate contiguous registers for a `uint8x16x4_t'. These
	// registers are chosen to not conflict with the ones in the enc loop.
	register uint8x16_t t0 __asm__ ("v8");
	register uint8x16_t t1 __asm__ ("v9");
	register uint8x16_t t2 __asm__ ("v10");
	register uint8x16_t t3 __asm__ ("v11");

	__asm__ (
		"ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
		: [src] "+r" (p),
		  [t0] "=w" (t0),
		  [t1] "=w" (t1),
		  [t2] "=w" (t2),
		  [t3] "=w" (t3)
	);

	return (uint8x16x4_t) {
		.val[0] = t0,
		.val[1] = t1,
		.val[2] = t2,
		.val[3] = t3,
	};
#else
	// Intrinsics path: a single four-register structure load.
	return vld1q_u8_x4(p);
#endif
}
|
||||
|
||||
#include "../generic/32/dec_loop.c"
|
||||
#include "../generic/64/enc_loop.c"
|
||||
#include "dec_loop.c"
|
||||
|
||||
#ifdef BASE64_NEON64_USE_ASM
|
||||
# include "enc_loop_asm.c"
|
||||
#else
|
||||
# include "enc_reshuffle.c"
|
||||
# include "enc_loop.c"
|
||||
#endif
|
||||
|
||||
#endif // HAVE_NEON64
|
||||
|
||||
// Stride size is so large on these NEON 64-bit functions
|
||||
// (48 bytes encode, 64 bytes decode) that we inline the
|
||||
// uint64 codec to stay performant on smaller inputs.
|
||||
|
||||
// Public entry point: streaming Base64 encoder for the NEON64 codec.
// The included head fragment brings the working variables (`s', `slen',
// `o', `olen') into scope; the NEON loop handles 48-byte strides, the
// inlined generic 64-bit loop mops up smaller chunks (see the note above
// about stride size), and the tail fragment finishes the stream. When
// NEON64 support is compiled out, defer to the stub implementation.
void
base64_stream_encode_neon64 BASE64_ENC_PARAMS
{
#if HAVE_NEON64
	#include "../generic/enc_head.c"
	enc_loop_neon64(&s, &slen, &o, &olen);
	enc_loop_generic_64(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
// Public entry point: streaming Base64 decoder for the NEON64 codec.
// Mirrors the encoder above: the vectorized loop consumes 64-byte blocks,
// the inlined generic 32-bit loop handles smaller chunks, and the tail
// fragment performs the final bytewise decode and error reporting.
int
base64_stream_decode_neon64 BASE64_DEC_PARAMS
{
#if HAVE_NEON64
	#include "../generic/dec_head.c"
	dec_loop_neon64(&s, &slen, &o, &olen);
	dec_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,129 @@
|
|||
// The input consists of five valid character sets in the Base64 alphabet,
|
||||
// which we need to map back to the 6-bit values they represent.
|
||||
// There are three ranges, two singles, and then there's the rest.
|
||||
//
|
||||
// # From To LUT Characters
|
||||
// 1 [0..42] [255] #1 invalid input
|
||||
// 2 [43] [62] #1 +
|
||||
// 3 [44..46] [255] #1 invalid input
|
||||
// 4 [47] [63] #1 /
|
||||
// 5 [48..57] [52..61] #1 0..9
|
||||
// 6 [58..63] [255] #1 invalid input
|
||||
// 7 [64] [255] #2 invalid input
|
||||
// 8 [65..90] [0..25] #2 A..Z
|
||||
// 9 [91..96] [255] #2 invalid input
|
||||
// 10 [97..122] [26..51] #2 a..z
|
||||
// 11 [123..126] [255] #2 invalid input
|
||||
// (12) Everything else => invalid input
|
||||
|
||||
// The first LUT will use the VTBL instruction (out of range indices are set to
// 0 in destination). It covers input bytes [0..63]: 255U marks invalid
// input, the other entries are the decoded 6-bit values for '+', '/' and
// the digits (see the range table above).
static const uint8_t dec_lut1[] = {
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 62U, 255U, 255U, 255U, 63U,
	52U, 53U, 54U, 55U, 56U, 57U, 58U, 59U, 60U, 61U, 255U, 255U, 255U, 255U, 255U, 255U,
};
|
||||
|
||||
// The second LUT will use the VTBX instruction (out of range indices will be
// unchanged in destination). Input [64..126] will be mapped to index [1..63]
// in this LUT. Index 0 means that value comes from LUT #1. Covers the
// upper-case and lower-case alphabet ranges; 255U marks invalid input.
static const uint8_t dec_lut2[] = {
	0U, 255U, 0U, 1U, 2U, 3U, 4U, 5U, 6U, 7U, 8U, 9U, 10U, 11U, 12U, 13U,
	14U, 15U, 16U, 17U, 18U, 19U, 20U, 21U, 22U, 23U, 24U, 25U, 255U, 255U, 255U, 255U,
	255U, 255U, 26U, 27U, 28U, 29U, 30U, 31U, 32U, 33U, 34U, 35U, 36U, 37U, 38U, 39U,
	40U, 41U, 42U, 43U, 44U, 45U, 46U, 47U, 48U, 49U, 50U, 51U, 255U, 255U, 255U, 255U,
};
|
||||
|
||||
// All input values in range for the first look-up will be 0U in the second
|
||||
// look-up result. All input values out of range for the first look-up will be
|
||||
// 0U in the first look-up result. Thus, the two results can be ORed without
|
||||
// conflicts.
|
||||
//
|
||||
// Invalid characters that are in the valid range for either look-up will be
|
||||
// set to 255U in the combined result. Other invalid characters will just be
|
||||
// passed through with the second look-up result (using the VTBX instruction).
|
||||
// Since the second LUT is 64 bytes, those passed-through values are guaranteed
|
||||
// to have a value greater than 63U. Therefore, valid characters will be mapped
|
||||
// to the valid [0..63] range and all invalid characters will be mapped to
|
||||
// values greater than 63.
|
||||
|
||||
// Vectorized NEON64 decoding loop: consumes Base64 input in 64-byte blocks
// and emits 48 decoded bytes per block, using the two 64-byte LUTs above.
// Stops early on short input or on any invalid character, handing the
// remainder back to the caller's bytewise fallback.
static inline void
dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	// Load both decode tables once, outside the loop:
	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);

	do {
		const uint8x16_t offset = vdupq_n_u8(63U);
		uint8x16x4_t dec1, dec2;
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
		uint8x16x4_t str = vld4q_u8((uint8_t *) *s);

		// Get indices for second LUT: saturating subtract maps bytes
		// [64..126] to [1..63] and everything lower to 0:
		dec2.val[0] = vqsubq_u8(str.val[0], offset);
		dec2.val[1] = vqsubq_u8(str.val[1], offset);
		dec2.val[2] = vqsubq_u8(str.val[2], offset);
		dec2.val[3] = vqsubq_u8(str.val[3], offset);

		// Get values from first LUT (out-of-range indices yield 0):
		dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
		dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
		dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
		dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);

		// Get values from second LUT (out-of-range indices pass
		// through unchanged, guaranteed > 63 — see comment above):
		dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
		dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
		dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
		dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);

		// Get final values by merging the two non-conflicting results:
		str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
		str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
		str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
		str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);

		// Check for invalid input, any value larger than 63:
		const uint8x16_t classified
			= vorrq_u8(
				vorrq_u8(vcgtq_u8(str.val[0], vdupq_n_u8(63)), vcgtq_u8(str.val[1], vdupq_n_u8(63))),
				vorrq_u8(vcgtq_u8(str.val[2], vdupq_n_u8(63)), vcgtq_u8(str.val[3], vdupq_n_u8(63)))
			);

		// Check that all bits are zero:
		if (vmaxvq_u8(classified) != 0U) {
			break;
		}

		// Compress four bytes into three:
		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

		// Interleave and store decoded result:
		vst3q_u8((uint8_t *) *o, dec);

		*s += 64;
		*o += 48;

	} while (--rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
// Encode one 48-byte block to 64 Base64 characters using the pre-loaded
// 64-byte encoding table, advancing both pointers.
static BASE64_FORCE_INLINE void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Divide bits of three input bytes over four output bytes:
	uint8x16x4_t out = enc_reshuffle(src);

	// The bits have now been shifted to the right locations;
	// translate their values 0..63 to the Base64 alphabet.
	// Use a 64-byte table lookup:
	out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
	out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
	out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
	out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
}
|
||||
|
||||
static inline void
|
||||
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
|
||||
{
|
||||
size_t rounds = *slen / 48;
|
||||
|
||||
*slen -= rounds * 48; // 48 bytes consumed per round
|
||||
*olen += rounds * 64; // 64 bytes produced per round
|
||||
|
||||
// Load the encoding table:
|
||||
const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
|
||||
|
||||
while (rounds > 0) {
|
||||
if (rounds >= 8) {
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
rounds -= 8;
|
||||
continue;
|
||||
}
|
||||
if (rounds >= 4) {
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
rounds -= 4;
|
||||
continue;
|
||||
}
|
||||
if (rounds >= 2) {
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
rounds -= 2;
|
||||
continue;
|
||||
}
|
||||
enc_loop_neon64_inner(s, o, tbl_enc);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,168 @@
|
|||
// Apologies in advance for combining the preprocessor with inline assembly,
|
||||
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
|
||||
// code repetition. The preprocessor is used to template large sections of
|
||||
// inline assembly that differ only in the registers used. If the code was
|
||||
// written out by hand, it would become very large and hard to audit.
|
||||
|
||||
// Generate a block of inline assembly that loads three user-defined registers
// A, B, C from memory and deinterleaves them, post-incrementing the src
// pointer. The register set should be sequential.
#define LOAD(A, B, C) \
	"ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t"

// Generate a block of inline assembly that takes three deinterleaved registers
// and shuffles the bytes. The output is in temporary registers t0..t3.
// (This is the asm counterpart of `enc_reshuffle'.)
#define SHUF(A, B, C) \
	"ushr %[t0].16b, "A".16b, #2 \n\t" \
	"ushr %[t1].16b, "B".16b, #4 \n\t" \
	"ushr %[t2].16b, "C".16b, #6 \n\t" \
	"sli %[t1].16b, "A".16b, #4 \n\t" \
	"sli %[t2].16b, "B".16b, #2 \n\t" \
	"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
	"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
	"and %[t3].16b, "C".16b, %[n63].16b \n\t"

// Generate a block of inline assembly that takes temporary registers t0..t3
// and translates them to the base64 alphabet, using a table loaded into
// v8..v11. The output is in user-defined registers A..D.
#define TRAN(A, B, C, D) \
	"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
	"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
	"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
	"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"

// Generate a block of inline assembly that interleaves four registers and
// stores them, post-incrementing the destination pointer.
#define STOR(A, B, C, D) \
	"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result.
#define ROUND() \
	LOAD("v12", "v13", "v14") \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")

// Generate a block of assembly that generates a type A interleaved encoder
// round. It uses registers that were loaded by the previous type B round, and
// in turn loads registers for the next type B round.
#define ROUND_A() \
	SHUF("v2", "v3", "v4") \
	LOAD("v12", "v13", "v14") \
	TRAN("v2", "v3", "v4", "v5") \
	STOR("v2", "v3", "v4", "v5")

// Type B interleaved encoder round. Same as type A, but register sets swapped.
#define ROUND_B() \
	SHUF("v12", "v13", "v14") \
	LOAD("v2", "v3", "v4") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")

// The first type A round needs to load its own registers.
#define ROUND_A_FIRST() \
	LOAD("v2", "v3", "v4") \
	ROUND_A()

// The last type B round omits the load for the next step.
#define ROUND_B_LAST() \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")
|
||||
|
||||
// Suppress clang's warning that the literal string in the asm statement is
|
||||
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
|
||||
// compilers). It may be true, but the goal here is not C99 portability.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
|
||||
// Main NEON64 encoding loop, assembly version: one large inline-asm block
// composed from the ROUND* macros above. Consumes 48 bytes and produces
// 64 Base64 characters per round. Dispatches the work as an 8x unrolled
// interleaved loop, a 4x unrolled chunk, then the remaining 0..3 rounds.
static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	if (rounds == 0) {
		return;
	}

	*slen -= rounds * 48;	// 48 bytes consumed per round.
	*olen += rounds * 64;	// 64 bytes produced per round.

	// Number of times to go through the 8x loop.
	size_t loops = rounds / 8;

	// Number of rounds remaining after the 8x loop.
	rounds %= 8;

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;

	__asm__ volatile (

		// Load the encoding table into v8..v11.
		" ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"

		// If there are eight rounds or more, enter an 8x unrolled loop
		// of interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations to maximize
		// pipeline throughput.
		" cbz %[loops], 4f \n\t"

		// The SIMD instructions do not touch the flags.
		"88: subs %[loops], %[loops], #1 \n\t"
		" " ROUND_A_FIRST()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B_LAST()
		" b.ne 88b \n\t"

		// Enter a 4x unrolled loop for rounds of 4 or more.
		"4: cmp %[rounds], #4 \n\t"
		" b.lt 30f \n\t"
		" " ROUND_A_FIRST()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B_LAST()
		" sub %[rounds], %[rounds], #4 \n\t"

		// Dispatch the remaining rounds 0..3.
		"30: cbz %[rounds], 0f \n\t"
		" cmp %[rounds], #2 \n\t"
		" b.eq 2f \n\t"
		" b.lt 1f \n\t"

		// Block of non-interlaced encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		: [loops] "+r" (loops),
		  [src] "+r" (*s),
		  [dst] "+r" (*o),
		  [t0] "=&w" (tmp0),
		  [t1] "=&w" (tmp1),
		  [t2] "=&w" (tmp2),
		  [t3] "=&w" (tmp3)

		// Inputs (not modified).
		: [rounds] "r" (rounds),
		  [tbl] "r" (base64_table_enc_6bit),
		  [n63] "w" (vdupq_n_u8(63))

		// Clobbers: the two round register sets (v2..v5, v12..v15), the
		// table registers v8..v11, condition flags and memory.
		: "v2", "v3", "v4", "v5",
		  "v8", "v9", "v10", "v11",
		  "v12", "v13", "v14", "v15",
		  "cc", "memory"
	);
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
// Redistribute the bits of three input bytes over four output bytes, so
// that each output byte carries one 6-bit Base64 value in its low bits.
static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (const uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Input:
	// in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
	// in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
	// in[2]  = c7 c6 c5 c4 c3 c2 c1 c0

	// Output:
	// out[0] = 00 00 a7 a6 a5 a4 a3 a2
	// out[1] = 00 00 a1 a0 b7 b6 b5 b4
	// out[2] = 00 00 b3 b2 b1 b0 c7 c6
	// out[3] = 00 00 c5 c4 c3 c2 c1 c0

	// Move the input bits to where they need to be in the outputs. Except
	// for the first output, the high two bits are not cleared. The SLI
	// (shift-left-and-insert) merges the shifted bits into the vector
	// produced by the preceding shift-right.
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vshrq_n_u8(in.val[1], 4);
	out.val[2] = vshrq_n_u8(in.val[2], 6);
	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4);
	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2);

	// Clear the high two bits in the second, third and fourth output.
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(in.val[2], vdupq_n_u8(0x3F));

	return out;
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if HAVE_SSE41
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push (__attribute__((target("sse4.1"))), apply_to=function)
|
||||
#else
|
||||
#pragma GCC target("sse4.1")
|
||||
#endif
|
||||
#include <smmintrin.h>
|
||||
|
||||
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
|
||||
#ifndef BASE64_SSE41_USE_ASM
|
||||
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
|
||||
# define BASE64_SSE41_USE_ASM 1
|
||||
# else
|
||||
# define BASE64_SSE41_USE_ASM 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "../ssse3/dec_reshuffle.c"
|
||||
#include "../ssse3/dec_loop.c"
|
||||
|
||||
#if BASE64_SSE41_USE_ASM
|
||||
# include "../ssse3/enc_loop_asm.c"
|
||||
#else
|
||||
# include "../ssse3/enc_translate.c"
|
||||
# include "../ssse3/enc_reshuffle.c"
|
||||
# include "../ssse3/enc_loop.c"
|
||||
#endif
|
||||
|
||||
#endif // HAVE_SSE41
|
||||
|
||||
// Public entry point: streaming Base64 encoder for the SSE4.1 build. The
// bulk loop is the SSSE3 implementation recompiled under the sse4.1 target
// pragma at the top of this file; the included head fragment brings the
// working variables (`s', `slen', `o', `olen') into scope and the tail
// fragment finishes the stream. Falls back to the stub when SSE4.1 support
// is compiled out.
void
base64_stream_encode_sse41 BASE64_ENC_PARAMS
{
#if HAVE_SSE41
	#include "../generic/enc_head.c"
	enc_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
// Public entry point: streaming Base64 decoder for the SSE4.1 build; the
// bulk loop is the SSSE3 decoder recompiled with SSE4.1 codegen enabled.
int
base64_stream_decode_sse41 BASE64_DEC_PARAMS
{
#if HAVE_SSE41
	#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
// clang only: close the target("sse4.1") attribute region pushed at the
// top of this file.
#if defined(__clang__)
#pragma clang attribute pop
#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if HAVE_SSE42
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push (__attribute__((target("sse4.2"))), apply_to=function)
|
||||
#else
|
||||
#pragma GCC target("sse4.2")
|
||||
#endif
|
||||
#include <nmmintrin.h>
|
||||
|
||||
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
|
||||
#ifndef BASE64_SSE42_USE_ASM
|
||||
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
|
||||
# define BASE64_SSE42_USE_ASM 1
|
||||
# else
|
||||
# define BASE64_SSE42_USE_ASM 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "../ssse3/dec_reshuffle.c"
|
||||
#include "../ssse3/dec_loop.c"
|
||||
|
||||
#if BASE64_SSE42_USE_ASM
|
||||
# include "../ssse3/enc_loop_asm.c"
|
||||
#else
|
||||
# include "../ssse3/enc_translate.c"
|
||||
# include "../ssse3/enc_reshuffle.c"
|
||||
# include "../ssse3/enc_loop.c"
|
||||
#endif
|
||||
|
||||
#endif // HAVE_SSE42
|
||||
|
||||
// Public entry point: streaming Base64 encoder for the SSE4.2 build. The
// bulk loop is the SSSE3 implementation recompiled under the sse4.2 target
// pragma at the top of this file; the included head fragment brings the
// working variables (`s', `slen', `o', `olen') into scope and the tail
// fragment finishes the stream. Falls back to the stub when SSE4.2 support
// is compiled out.
void
base64_stream_encode_sse42 BASE64_ENC_PARAMS
{
#if HAVE_SSE42
	#include "../generic/enc_head.c"
	enc_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
// Public entry point: streaming Base64 decoder for the SSE4.2 build; the
// bulk loop is the SSSE3 decoder recompiled with SSE4.2 codegen enabled.
int
base64_stream_decode_sse42 BASE64_DEC_PARAMS
{
#if HAVE_SSE42
	#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
// clang only: close the target("sse4.2") attribute region pushed at the
// top of this file.
#if defined(__clang__)
#pragma clang attribute pop
#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libbase64.h"
|
||||
#include "../../tables/tables.h"
|
||||
#include "../../codecs.h"
|
||||
#include "config.h"
|
||||
#include "../../env.h"
|
||||
|
||||
#if HAVE_SSSE3
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push (__attribute__((target("ssse3"))), apply_to=function)
|
||||
#else
|
||||
#pragma GCC target("ssse3")
|
||||
#endif
|
||||
#include <tmmintrin.h>
|
||||
|
||||
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
|
||||
// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
|
||||
// registers, which is not enough to run the inline assembly.
|
||||
#ifndef BASE64_SSSE3_USE_ASM
|
||||
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
|
||||
# define BASE64_SSSE3_USE_ASM 1
|
||||
# else
|
||||
# define BASE64_SSSE3_USE_ASM 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "dec_reshuffle.c"
|
||||
#include "dec_loop.c"
|
||||
|
||||
#if BASE64_SSSE3_USE_ASM
|
||||
# include "enc_loop_asm.c"
|
||||
#else
|
||||
# include "enc_reshuffle.c"
|
||||
# include "enc_translate.c"
|
||||
# include "enc_loop.c"
|
||||
#endif
|
||||
|
||||
#endif // HAVE_SSSE3
|
||||
|
||||
// Public entry point: encode a chunk of the input stream with the SSSE3
// codec. The signature is supplied by the BASE64_ENC_PARAMS macro.
void
base64_stream_encode_ssse3 BASE64_ENC_PARAMS
{
#if HAVE_SSSE3
	// Provides the locals (s, slen, o, olen) consumed by the loop below;
	// see ../generic/enc_head.c.
	#include "../generic/enc_head.c"
	enc_loop_ssse3(&s, &slen, &o, &olen);
	// Encodes whatever partial block remains after the bulk loop; see
	// ../generic/enc_tail.c.
	#include "../generic/enc_tail.c"
#else
	// SSSE3 support was compiled out: fall back on the stub.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
||||
// Public entry point: decode a chunk of the input stream with the SSSE3
// codec. The signature is supplied by the BASE64_DEC_PARAMS macro.
int
base64_stream_decode_ssse3 BASE64_DEC_PARAMS
{
#if HAVE_SSSE3
	// Provides the locals (s, slen, o, olen) consumed by the loop below;
	// see ../generic/dec_head.c.
	#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	// Decodes the remaining bytes and supplies this branch's return
	// statement (note there is no explicit return after this include);
	// see ../generic/dec_tail.c.
	#include "../generic/dec_tail.c"
// Balances the "#pragma clang attribute push" at the top of this file; both
// sit under HAVE_SSSE3, and the pop shares the push's __clang__ guard, so the
// push/pop pair stays matched in every configuration.
#if defined(__clang__)
#pragma clang attribute pop
#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}
|
||||
|
|
@ -0,0 +1,173 @@
|
|||
// The input consists of six character sets in the Base64 alphabet, which we
|
||||
// need to map back to the 6-bit values they represent. There are three ranges,
|
||||
// two singles, and then there's the rest.
|
||||
//
|
||||
// # From To Add Characters
|
||||
// 1 [43] [62] +19 +
|
||||
// 2 [47] [63] +16 /
|
||||
// 3 [48..57] [52..61] +4 0..9
|
||||
// 4 [65..90] [0..25] -65 A..Z
|
||||
// 5 [97..122] [26..51] -71 a..z
|
||||
// (6) Everything else => invalid input
|
||||
//
|
||||
// We will use lookup tables for character validation and offset computation.
|
||||
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
|
||||
// allows to mask with 0x2F instead of 0x0F and thus save one constant
|
||||
// declaration (register and/or memory access).
|
||||
//
|
||||
// For offsets:
|
||||
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
|
||||
// 0000 = garbage
|
||||
// 0001 = /
|
||||
// 0010 = +
|
||||
// 0011 = 0-9
|
||||
// 0100 = A-Z
|
||||
// 0101 = A-Z
|
||||
// 0110 = a-z
|
||||
// 0111 = a-z
|
||||
// 1000 >= garbage
|
||||
//
|
||||
// For validation, here's the table.
|
||||
// A character is valid if and only if the AND of the 2 lookups equals 0:
|
||||
//
|
||||
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
|
||||
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
|
||||
//
|
||||
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
|
||||
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
//
|
||||
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
|
||||
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
//
|
||||
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
|
||||
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
|
||||
//
|
||||
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
||||
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
|
||||
//
|
||||
// 0100 0x04 char @ A B C D E F G H I J K L M N O
|
||||
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
|
||||
//
|
||||
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
|
||||
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
|
||||
//
|
||||
// 0110 0x04 char ` a b c d e f g h i j k l m n o
|
||||
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
|
||||
// 0111 0x08 char p q r s t u v w x y z { | } ~
|
||||
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
|
||||
//
|
||||
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
||||
|
||||
// Decode one 16-byte block of Base64 characters into 12 output bytes.
// Returns 1 on success (pointers and *rounds updated); returns 0 without
// consuming any input when the block contains a byte outside the Base64
// alphabet, so the caller can fall back on the bytewise decoder for precise
// error reporting. The lookup-table values are derived in the large comment
// block preceding this function.
static BASE64_FORCE_INLINE int
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Validation LUT indexed by the low nibble of each input byte:
	const __m128i lut_lo = _mm_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	// Validation LUT indexed by the high nibble of each input byte:
	const __m128i lut_hi = _mm_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

	// Per-range deltas added to map alphabet bytes to 6-bit values:
	const __m128i lut_roll = _mm_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0);

	const __m128i mask_2F = _mm_set1_epi8(0x2F);

	// Load input:
	__m128i str = _mm_loadu_si128((__m128i *) *s);

	// Table lookups. Masking with 0x2F (not 0x0F) is safe because
	// _mm_shuffle_epi8 only looks at the low four index bits, and it saves
	// a separate 0x0F constant (see the comment block above):
	const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
	const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
	const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
	const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);

	// Check for invalid input: if any "and" values from lo and hi are not
	// zero, fall back on bytewise code to do error checking and reporting:
	if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
		return 0;
	}

	// Perfect-hash index into lut_roll: high nibble, minus one for the
	// '/' character (eq_2F is -1 where the byte equals 0x2F):
	const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
	const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm_add_epi8(str, roll);

	// Reshuffle the input to packed 12-byte output format:
	str = dec_reshuffle(str);

	// Store the output (writes 16 bytes; the last 4 are zero filler the
	// caller accounts for):
	_mm_storeu_si128((__m128i *) *o, str);

	*s += 16;
	*o += 12;
	*rounds -= 1;

	return 1;
}
|
||||
|
||||
// Bulk-decode driver: turns 16-byte blocks of Base64 input into 12-byte
// output blocks via dec_loop_ssse3_inner(), stopping at the first block that
// contains invalid input so the bytewise fallback can take over. Rounds are
// attempted in batches of 8, 4, 2, then a final single round, exactly
// mirroring the classic manually-unrolled version.
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 24) {
		return;
	}

	// Process blocks of 16 bytes per round. Because 4 extra zero bytes
	// are written after the output, ensure that there will be at least 8
	// bytes of input data left to cover the gap. (6 data bytes and up to
	// two end-of-string markers.)
	size_t rounds = (*slen - 8) / 16;

	// Account optimistically for all rounds; any that are skipped because
	// of invalid input are given back at the end.
	*slen -= rounds * 16;	// 16 bytes consumed per round
	*olen += rounds * 12;	// 12 bytes produced per round

	for (;;) {
		if (rounds == 0) {
			break;
		}

		// Pick the largest batch size that still fits.
		size_t batch;
		if (rounds >= 8) {
			batch = 8;
		} else if (rounds >= 4) {
			batch = 4;
		} else if (rounds >= 2) {
			batch = 2;
		} else {
			// Final single round: attempt it and stop either way.
			dec_loop_ssse3_inner(s, o, &rounds);
			break;
		}

		// Run the batch; leave the bulk loop at the first block that
		// fails validation (inner leaves *rounds untouched on failure).
		int invalid = 0;
		for (size_t i = 0; i < batch; i++) {
			if (!dec_loop_ssse3_inner(s, o, &rounds)) {
				invalid = 1;
				break;
			}
		}
		if (invalid) {
			break;
		}
	}

	// Adjust for any rounds that were skipped:
	*slen += rounds * 16;
	*olen -= rounds * 12;
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
static BASE64_FORCE_INLINE __m128i
|
||||
dec_reshuffle (const __m128i in)
|
||||
{
|
||||
// in, bits, upper case are most significant bits, lower case are least significant bits
|
||||
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
|
||||
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
|
||||
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
|
||||
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
|
||||
|
||||
const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
|
||||
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
|
||||
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
|
||||
// 0000eeee FFffffff 0000DDDD DDddEEEE
|
||||
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
|
||||
|
||||
const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
|
||||
// 00000000 JJJJJJjj KKKKkkkk LLllllll
|
||||
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
|
||||
// 00000000 DDDDDDdd EEEEeeee FFffffff
|
||||
// 00000000 AAAAAAaa BBBBbbbb CCcccccc
|
||||
|
||||
// Pack bytes together:
|
||||
return _mm_shuffle_epi8(out, _mm_setr_epi8(
|
||||
2, 1, 0,
|
||||
6, 5, 4,
|
||||
10, 9, 8,
|
||||
14, 13, 12,
|
||||
-1, -1, -1, -1));
|
||||
// 00000000 00000000 00000000 00000000
|
||||
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
|
||||
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
|
||||
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
static BASE64_FORCE_INLINE void
|
||||
enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
|
||||
{
|
||||
// Load input:
|
||||
__m128i str = _mm_loadu_si128((__m128i *) *s);
|
||||
|
||||
// Reshuffle:
|
||||
str = enc_reshuffle(str);
|
||||
|
||||
// Translate reshuffled bytes to the Base64 alphabet:
|
||||
str = enc_translate(str);
|
||||
|
||||
// Store:
|
||||
_mm_storeu_si128((__m128i *) *o, str);
|
||||
|
||||
*s += 12;
|
||||
*o += 16;
|
||||
}
|
||||
|
||||
// Bulk-encode driver: run enc_loop_ssse3_inner() once per 12-byte block of
// input, batching rounds in groups of 8, 4, 2 and 1 — the same schedule as
// the classic manually-unrolled loop.
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 16) {
		return;
	}

	// Each round consumes 12 bytes but loads 16, so keep 4 spare bytes at
	// the end of the input to guarantee the final load stays inside the
	// buffer:
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;	// 12 bytes consumed per round
	*olen += rounds * 16;	// 16 bytes produced per round

	while (rounds > 0) {
		// Choose the largest batch that fits.
		size_t batch = 1;
		if (rounds >= 8) {
			batch = 8;
		} else if (rounds >= 4) {
			batch = 4;
		} else if (rounds >= 2) {
			batch = 2;
		}

		for (size_t i = 0; i < batch; i++) {
			enc_loop_ssse3_inner(s, o);
		}
		rounds -= batch;
	}
}
|
||||
|
|
@ -0,0 +1,268 @@
|
|||
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round.
// (lddqu is an unaligned 16-byte load; each round advances the source by 12.)
#define LOAD(R0, ROUND) \
	"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
// (Assembly version of the mask/multiply steps in the intrinsics-based
// enc_reshuffle(); the msk0..msk3 operands hold the same constants.)
#define SHUF(R0, R1) \
	"pshufb %[lut0], %["R0"] \n\t" \
	"movdqa %["R0"], %["R1"] \n\t" \
	"pand %[msk0], %["R0"] \n\t" \
	"pand %[msk2], %["R1"] \n\t" \
	"pmulhuw %[msk1], %["R0"] \n\t" \
	"pmullw %[msk3], %["R1"] \n\t" \
	"por %["R1"], %["R0"] \n\t"

// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
// (Same steps as the intrinsics-based enc_translate(): saturating subtract of
// 51, compare-greater-than 25, LUT shuffle, add.)
#define TRAN(R0, R1, R2) \
	"movdqa %["R0"], %["R1"] \n\t" \
	"movdqa %["R0"], %["R2"] \n\t" \
	"psubusb %[n51], %["R1"] \n\t" \
	"pcmpgtb %[n25], %["R2"] \n\t" \
	"psubb %["R2"], %["R1"] \n\t" \
	"movdqa %[lut1], %["R2"] \n\t" \
	"pshufb %["R1"], %["R2"] \n\t" \
	"paddb %["R2"], %["R0"] \n\t"

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
	"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
	LOAD("a", 0) \
	SHUF("a", "b") \
	TRAN("a", "b", "c") \
	STOR("a", 0) \
	"add $12, %[src] \n\t" \
	"add $16, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
//  + indicates that a register is loaded by that step.
//  | indicates that a register is in use and must not be touched.
//  - indicates that a register is decommissioned by that step.
//  x indicates that a register is used as a temporary by that step.
//  V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT()			/* a b c d e f */ \
	LOAD("a", 0)			/* +           */ \
	SHUF("a", "d")			/* |     +     */ \
	LOAD("b", 1)			/* | +   |     */ \
	TRAN("a", "d", "e")		/* | |   - x   */ \
	LOAD("c", 2)			/* V V V       */

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.

#define ROUND_3(ROUND, A,B,C,D,E,F)	/* A B C D E F */ \
	LOAD(D, (ROUND + 3))		/* V V V +     */ \
	SHUF(B, E)			/* | | | | +   */ \
	STOR(A, (ROUND + 0))		/* - | | | |   */ \
	TRAN(B, E, F)			/*   | | | - x */ \
	LOAD(E, (ROUND + 4))		/*   | | | +   */ \
	SHUF(C, A)			/* + | | | |   */ \
	STOR(B, (ROUND + 1))		/* | - | | |   */ \
	TRAN(C, A, F)			/* - | | |   x */ \
	LOAD(F, (ROUND + 5))		/*   | | | +   */ \
	SHUF(D, A)			/* + | | | |   */ \
	STOR(C, (ROUND + 2))		/* | - | | |   */ \
	TRAN(D, A, B)			/* - x V V V   */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F)	/* A B C D E F */ \
	SHUF(E, A)			/* +     V V V */ \
	STOR(D, (ROUND + 3))		/* |     - | | */ \
	TRAN(E, A, B)			/* - x     | | */ \
	SHUF(F, C)			/*     +     | */ \
	STOR(E, (ROUND + 4))		/*     |   - | */ \
	TRAN(F, C, D)			/*     - x   | */ \
	STOR(F, (ROUND + 5))		/*           - */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
	ROUND_3_A(ROUND) \
	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
	ROUND_3_B(ROUND) \
	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
// Hand-written inline-assembly SSSE3 encoder: consumes 12 input bytes and
// produces 16 Base64 characters per round, using progressively smaller
// unrolled loops (36x, 18x, 9x, 6x, then a 0..5 dispatch). The numeric
// labels (36, 18, 9, 6, 55, 45, 5..0) are GNU-as local labels; the f/b
// suffixes on jumps mean "forward"/"backward" reference.
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// For a clearer explanation of the algorithm used by this function,
	// please refer to the plain (not inline assembly) implementation. This
	// function follows the same basic logic.

	if (*slen < 16) {
		return;
	}

	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
	// bytes, so "reserve" four bytes from the input buffer to ensure that
	// we never read beyond the end of the input buffer.
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;	// 12 bytes consumed per round
	*olen += rounds * 16;	// 16 bytes produced per round

	// Number of times to go through the 36x loop.
	size_t loops = rounds / 36;

	// Number of rounds remaining after the 36x loop.
	rounds %= 36;

	// Lookup tables.
	// lut0: the byte shuffle used by SHUF (same table as the intrinsics
	// enc_reshuffle); lut1: the per-range offsets used by TRAN (same
	// table as the intrinsics enc_translate).
	const __m128i lut0 = _mm_set_epi8(
		10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);

	const __m128i lut1 = _mm_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Temporary registers.
	__m128i a, b, c, d, e, f;

	__asm__ volatile (

		// If there are 36 rounds or more, enter a 36x unrolled loop of
		// interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations (table lookups,
		// etc) to maximize pipeline throughput.
		" test %[loops], %[loops] \n\t"
		" jz 18f \n\t"
		" jmp 36f \n\t"
		" \n\t"
		".balign 64 \n\t"
		"36: " ROUND_3_INIT()
		" "  ROUND_3_A( 0)
		" "  ROUND_3_B( 3)
		" "  ROUND_3_A( 6)
		" "  ROUND_3_B( 9)
		" "  ROUND_3_A(12)
		" "  ROUND_3_B(15)
		" "  ROUND_3_A(18)
		" "  ROUND_3_B(21)
		" "  ROUND_3_A(24)
		" "  ROUND_3_B(27)
		" "  ROUND_3_A_LAST(30)
		" add $(12 * 36), %[src] \n\t"
		" add $(16 * 36), %[dst] \n\t"
		" dec %[loops] \n\t"
		" jnz 36b \n\t"

		// Enter an 18x unrolled loop for rounds of 18 or more.
		"18: cmp $18, %[rounds] \n\t"
		" jl 9f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B(3)
		" " ROUND_3_A(6)
		" " ROUND_3_B(9)
		" " ROUND_3_A_LAST(12)
		" sub $18, %[rounds] \n\t"
		" add $(12 * 18), %[src] \n\t"
		" add $(16 * 18), %[dst] \n\t"

		// Enter a 9x unrolled loop for rounds of 9 or more.
		"9: cmp $9, %[rounds] \n\t"
		" jl 6f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B_LAST(3)
		" sub $9, %[rounds] \n\t"
		" add $(12 * 9), %[src] \n\t"
		" add $(16 * 9), %[dst] \n\t"

		// Enter a 6x unrolled loop for rounds of 6 or more.
		"6: cmp $6, %[rounds] \n\t"
		" jl 55f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A_LAST(0)
		" sub $6, %[rounds] \n\t"
		" add $(12 * 6), %[src] \n\t"
		" add $(16 * 6), %[dst] \n\t"

		// Dispatch the remaining rounds 0..5.
		"55: cmp $3, %[rounds] \n\t"
		" jg 45f \n\t"
		" je 3f \n\t"
		" cmp $1, %[rounds] \n\t"
		" jg 2f \n\t"
		" je 1f \n\t"
		" jmp 0f \n\t"

		"45: cmp $4, %[rounds] \n\t"
		" je 4f \n\t"

		// Block of non-interlaced encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		// (Each ROUND() advances %[src]/%[dst] itself.)
		"5: " ROUND()
		"4: " ROUND()
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		: [rounds] "+r" (rounds),
		  [loops]  "+r" (loops),
		  [src]    "+r" (*s),
		  [dst]    "+r" (*o),
		  [a]      "=&x" (a),
		  [b]      "=&x" (b),
		  [c]      "=&x" (c),
		  [d]      "=&x" (d),
		  [e]      "=&x" (e),
		  [f]      "=&x" (f)

		// Inputs (not modified).
		: [lut0] "x" (lut0),
		  [lut1] "x" (lut1),
		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
		  [n51]  "x" (_mm_set1_epi8(51)),
		  [n25]  "x" (_mm_set1_epi8(25))

		// Clobbers.
		: "cc", "memory"
	);
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
static BASE64_FORCE_INLINE __m128i
|
||||
enc_reshuffle (__m128i in)
|
||||
{
|
||||
// Input, bytes MSB to LSB:
|
||||
// 0 0 0 0 l k j i h g f e d c b a
|
||||
|
||||
in = _mm_shuffle_epi8(in, _mm_set_epi8(
|
||||
10, 11, 9, 10,
|
||||
7, 8, 6, 7,
|
||||
4, 5, 3, 4,
|
||||
1, 2, 0, 1));
|
||||
// in, bytes MSB to LSB:
|
||||
// k l j k
|
||||
// h i g h
|
||||
// e f d e
|
||||
// b c a b
|
||||
|
||||
const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00));
|
||||
// bits, upper case are most significant bits, lower case are least significant bits
|
||||
// 0000kkkk LL000000 JJJJJJ00 00000000
|
||||
// 0000hhhh II000000 GGGGGG00 00000000
|
||||
// 0000eeee FF000000 DDDDDD00 00000000
|
||||
// 0000bbbb CC000000 AAAAAA00 00000000
|
||||
|
||||
const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
|
||||
// 00000000 00kkkkLL 00000000 00JJJJJJ
|
||||
// 00000000 00hhhhII 00000000 00GGGGGG
|
||||
// 00000000 00eeeeFF 00000000 00DDDDDD
|
||||
// 00000000 00bbbbCC 00000000 00AAAAAA
|
||||
|
||||
const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0));
|
||||
// 00000000 00llllll 000000jj KKKK0000
|
||||
// 00000000 00iiiiii 000000gg HHHH0000
|
||||
// 00000000 00ffffff 000000dd EEEE0000
|
||||
// 00000000 00cccccc 000000aa BBBB0000
|
||||
|
||||
const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
|
||||
// 00llllll 00000000 00jjKKKK 00000000
|
||||
// 00iiiiii 00000000 00ggHHHH 00000000
|
||||
// 00ffffff 00000000 00ddEEEE 00000000
|
||||
// 00cccccc 00000000 00aaBBBB 00000000
|
||||
|
||||
return _mm_or_si128(t1, t3);
|
||||
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
|
||||
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
|
||||
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
|
||||
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
static BASE64_FORCE_INLINE __m128i
|
||||
enc_translate (const __m128i in)
|
||||
{
|
||||
// A lookup table containing the absolute offsets for all ranges:
|
||||
const __m128i lut = _mm_setr_epi8(
|
||||
65, 71, -4, -4,
|
||||
-4, -4, -4, -4,
|
||||
-4, -4, -4, -4,
|
||||
-19, -16, 0, 0
|
||||
);
|
||||
|
||||
// Translate values 0..63 to the Base64 alphabet. There are five sets:
|
||||
// # From To Abs Index Characters
|
||||
// 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
// 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
|
||||
// 2 [52..61] [48..57] -4 [2..11] 0123456789
|
||||
// 3 [62] [43] -19 12 +
|
||||
// 4 [63] [47] -16 13 /
|
||||
|
||||
// Create LUT indices from the input. The index for range #0 is right,
|
||||
// others are 1 less than expected:
|
||||
__m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51));
|
||||
|
||||
// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
|
||||
__m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));
|
||||
|
||||
// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
|
||||
// now correct:
|
||||
indices = _mm_sub_epi8(indices, mask);
|
||||
|
||||
// Add offsets to input values:
|
||||
return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices));
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue