feat: indie status page MVP -- FastAPI + SQLite

- 8 DB models (services, incidents, monitors, subscribers, etc.)
- Full CRUD API for services, incidents, monitors
- Public status page with live data
- Incident detail page with timeline
- API key authentication
- Uptime monitoring scheduler
- 13 tests passing
- TECHNICAL_DESIGN.md with full spec
This commit is contained in:
IndieStatusBot 2026-04-25 05:00:00 +00:00
commit 902133edd3
4655 changed files with 1342691 additions and 0 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,76 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_AVX
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
#else
#pragma GCC target("avx")
#endif
#include <immintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_AVX_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_AVX_USE_ASM 1
# else
# define BASE64_AVX_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#if BASE64_AVX_USE_ASM
# include "./enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_AVX
// Streaming Base64 encoder entry point for the AVX codec. The parameter
// list is supplied by the BASE64_ENC_PARAMS macro (stream state, src/srclen,
// out/outlen — the same names the stub call below uses).
void
base64_stream_encode_avx BASE64_ENC_PARAMS
{
#if HAVE_AVX
	// Shared prologue fragment: brings the working variables s, slen, o
	// and olen into scope (they are used by the loop call below).
#include "../generic/enc_head.c"
	// For supported compilers, use a hand-optimized inline assembly
	// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
	// AVX flags to generate better optimized AVX code.
#if BASE64_AVX_USE_ASM
	enc_loop_avx(&s, &slen, &o, &olen);
#else
	enc_loop_ssse3(&s, &slen, &o, &olen);
#endif
	// Shared epilogue fragment: handles the remaining tail bytes and
	// carries leftover state forward.
#include "../generic/enc_tail.c"
#else
	// AVX support was not compiled in; delegate to the stub codec.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming Base64 decoder entry point for the AVX codec. Decoding reuses
// the SSSE3 decode loop, compiled here with AVX enabled.
int
base64_stream_decode_avx BASE64_DEC_PARAMS
{
#if HAVE_AVX
	// Shared prologue fragment: brings s, slen, o and olen into scope.
#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	// Shared epilogue fragment; NOTE(review): no explicit return on this
	// path, so dec_tail.c is expected to supply the return statement —
	// confirm against the generic fragment.
#include "../generic/dec_tail.c"
	// Close the `#pragma clang attribute push` opened at the top of the
	// file; this is the last AVX-target function in the translation unit.
#if defined(__clang__)
#pragma clang attribute pop
#endif
#else
	// AVX support was not compiled in; delegate to the stub codec.
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,264 @@
// AVX Base64 encoder inner loop, implemented as one large inline-assembly
// statement templated with the preprocessor. Each round consumes 12 input
// bytes and produces 16 output bytes.
//
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round.
#define LOAD(R0, ROUND) \
	"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1, R2) \
	"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
	"vpand %["R1"], %[msk0], %["R2"] \n\t" \
	"vpand %["R1"], %[msk2], %["R1"] \n\t" \
	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
	"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
	"vpor %["R1"], %["R2"], %["R1"] \n\t"

// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
	"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
	"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
	"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
	"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
	"vpaddb %["R1"], %["R2"], %["R0"] \n\t"

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
	"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
	LOAD("a", 0) \
	SHUF("a", "b", "c") \
	TRAN("a", "b", "c") \
	STOR("a", 0) \
	"add $12, %[src] \n\t" \
	"add $16, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() /* a b c d e f */ \
	LOAD("a", 0) /* + */ \
	SHUF("a", "d", "e") /* | + x */ \
	LOAD("b", 1) /* | + | */ \
	TRAN("a", "d", "e") /* | | - x */ \
	LOAD("c", 2) /* V V V */

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	LOAD(D, (ROUND + 3)) /* V V V + */ \
	SHUF(B, E, F) /* | | | | + x */ \
	STOR(A, (ROUND + 0)) /* - | | | | */ \
	TRAN(B, E, F) /* | | | - x */ \
	LOAD(E, (ROUND + 4)) /* | | | + */ \
	SHUF(C, A, F) /* + | | | | x */ \
	STOR(B, (ROUND + 1)) /* | - | | | */ \
	TRAN(C, A, F) /* - | | | x */ \
	LOAD(F, (ROUND + 5)) /* | | | + */ \
	SHUF(D, A, B) /* + x | | | | */ \
	STOR(C, (ROUND + 2)) /* | - | | | */ \
	TRAN(D, A, B) /* - x V V V */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	SHUF(E, A, B) /* + x V V V */ \
	STOR(D, (ROUND + 3)) /* | - | | */ \
	TRAN(E, A, B) /* - x | | */ \
	SHUF(F, C, D) /* + x | | */ \
	STOR(E, (ROUND + 4)) /* | - | */ \
	TRAN(F, C, D) /* - x | */ \
	STOR(F, (ROUND + 5)) /* - */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
	ROUND_3_A(ROUND) \
	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
	ROUND_3_B(ROUND) \
	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"

static inline void
enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// For a clearer explanation of the algorithm used by this function,
	// please refer to the plain (not inline assembly) implementation. This
	// function follows the same basic logic.

	// Not enough input for even a single 16-byte load; leave everything
	// to the tail encoder.
	if (*slen < 16) {
		return;
	}

	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
	// bytes, so "reserve" four bytes from the input buffer to ensure that
	// we never read beyond the end of the input buffer.
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12; // 12 bytes consumed per round
	*olen += rounds * 16; // 16 bytes produced per round

	// Number of times to go through the 36x loop.
	size_t loops = rounds / 36;

	// Number of rounds remaining after the 36x loop.
	rounds %= 36;

	// Lookup tables.
	const __m128i lut0 = _mm_set_epi8(
		10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);

	const __m128i lut1 = _mm_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Temporary registers.
	__m128i a, b, c, d, e, f;

	// NOTE: numeric asm labels are block-local; a trailing 'f' in a jump
	// target means forward, a trailing 'b' means backward.
	__asm__ volatile (

		// If there are 36 rounds or more, enter a 36x unrolled loop of
		// interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations (table lookups,
		// etc) to maximize pipeline throughput.
		" test %[loops], %[loops] \n\t"
		" jz 18f \n\t"
		" jmp 36f \n\t"
		" \n\t"
		".balign 64 \n\t"
		"36: " ROUND_3_INIT()
		" " ROUND_3_A( 0)
		" " ROUND_3_B( 3)
		" " ROUND_3_A( 6)
		" " ROUND_3_B( 9)
		" " ROUND_3_A(12)
		" " ROUND_3_B(15)
		" " ROUND_3_A(18)
		" " ROUND_3_B(21)
		" " ROUND_3_A(24)
		" " ROUND_3_B(27)
		" " ROUND_3_A_LAST(30)
		" add $(12 * 36), %[src] \n\t"
		" add $(16 * 36), %[dst] \n\t"
		" dec %[loops] \n\t"
		" jnz 36b \n\t"

		// Enter an 18x unrolled loop for rounds of 18 or more.
		"18: cmp $18, %[rounds] \n\t"
		" jl 9f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B(3)
		" " ROUND_3_A(6)
		" " ROUND_3_B(9)
		" " ROUND_3_A_LAST(12)
		" sub $18, %[rounds] \n\t"
		" add $(12 * 18), %[src] \n\t"
		" add $(16 * 18), %[dst] \n\t"

		// Enter a 9x unrolled loop for rounds of 9 or more.
		"9: cmp $9, %[rounds] \n\t"
		" jl 6f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B_LAST(3)
		" sub $9, %[rounds] \n\t"
		" add $(12 * 9), %[src] \n\t"
		" add $(16 * 9), %[dst] \n\t"

		// Enter a 6x unrolled loop for rounds of 6 or more.
		"6: cmp $6, %[rounds] \n\t"
		" jl 55f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A_LAST(0)
		" sub $6, %[rounds] \n\t"
		" add $(12 * 6), %[src] \n\t"
		" add $(16 * 6), %[dst] \n\t"

		// Dispatch the remaining rounds 0..5.
		"55: cmp $3, %[rounds] \n\t"
		" jg 45f \n\t"
		" je 3f \n\t"
		" cmp $1, %[rounds] \n\t"
		" jg 2f \n\t"
		" je 1f \n\t"
		" jmp 0f \n\t"

		"45: cmp $4, %[rounds] \n\t"
		" je 4f \n\t"

		// Block of non-interleaved encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		"5: " ROUND()
		"4: " ROUND()
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		: [rounds] "+r" (rounds),
		  [loops] "+r" (loops),
		  [src] "+r" (*s),
		  [dst] "+r" (*o),
		  [a] "=&x" (a),
		  [b] "=&x" (b),
		  [c] "=&x" (c),
		  [d] "=&x" (d),
		  [e] "=&x" (e),
		  [f] "=&x" (f)

		// Inputs (not modified).
		: [lut0] "x" (lut0),
		  [lut1] "x" (lut1),
		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
		  [n51] "x" (_mm_set1_epi8(51)),
		  [n25] "x" (_mm_set1_epi8(25))

		// Clobbers.
		: "cc", "memory"
	);
}

#pragma GCC diagnostic pop

View file

@ -0,0 +1,66 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_AVX2
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
#else
#pragma GCC target("avx2")
#endif
#include <immintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_AVX2_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_AVX2_USE_ASM 1
# else
# define BASE64_AVX2_USE_ASM 0
# endif
#endif
#include "./dec_reshuffle.c"
#include "./dec_loop.c"
#if BASE64_AVX2_USE_ASM
# include "./enc_loop_asm.c"
#else
# include "./enc_translate.c"
# include "./enc_reshuffle.c"
# include "./enc_loop.c"
#endif
#endif // HAVE_AVX2
// Streaming Base64 encoder entry point for the AVX2 codec. enc_loop_avx2 is
// either the inline-assembly or the intrinsics implementation, depending on
// BASE64_AVX2_USE_ASM (selected by the includes at the top of this file).
void
base64_stream_encode_avx2 BASE64_ENC_PARAMS
{
#if HAVE_AVX2
	// Shared prologue fragment: brings s, slen, o and olen into scope.
#include "../generic/enc_head.c"
	enc_loop_avx2(&s, &slen, &o, &olen);
	// Shared epilogue fragment: encodes the remaining tail bytes.
#include "../generic/enc_tail.c"
#else
	// AVX2 support was not compiled in; delegate to the stub codec.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming Base64 decoder entry point for the AVX2 codec.
int
base64_stream_decode_avx2 BASE64_DEC_PARAMS
{
#if HAVE_AVX2
	// Shared prologue fragment: brings s, slen, o and olen into scope.
#include "../generic/dec_head.c"
	dec_loop_avx2(&s, &slen, &o, &olen);
	// Shared epilogue fragment; NOTE(review): no explicit return on this
	// path, so dec_tail.c is expected to supply the return statement —
	// confirm against the generic fragment.
#include "../generic/dec_tail.c"
	// Close the `#pragma clang attribute push` opened at the top of the
	// file; this is the last AVX2-target function in the translation unit.
#if defined(__clang__)
#pragma clang attribute pop
#endif
#else
	// AVX2 support was not compiled in; delegate to the stub codec.
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,110 @@
// Decode a single 32-byte block of base64 text into 24 bytes of output.
// Returns 1 on success and advances *s, *o and *rounds; returns 0 without
// consuming anything when the block contains a byte outside the base64
// alphabet, so the caller can stop and leave error reporting to the scalar
// tail decoder.
static BASE64_FORCE_INLINE int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Character-class lookup tables, indexed by the low and high nibble
	// of each input byte; a nonzero AND of the two classifications flags
	// an invalid character. (See the SSSE3 decoder for the derivation.)
	const __m256i lut_lo = _mm256_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	const __m256i lut_hi = _mm256_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

	// Per-range deltas that translate ASCII base64 characters to their
	// 6-bit values, selected by high nibble (biased by -1 for '/' via the
	// eq_2F comparison below).
	const __m256i lut_roll = _mm256_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0);

	const __m256i mask_2F = _mm256_set1_epi8(0x2F);

	// Load input:
	__m256i str = _mm256_loadu_si256((__m256i *) *s);

	// See the SSSE3 decoder for an explanation of the algorithm.
	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
	const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
	const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

	// Any overlap between the two classifications marks an invalid input
	// byte; reject the whole block without consuming it.
	if (!_mm256_testz_si256(lo, hi)) {
		return 0;
	}

	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
	const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm256_add_epi8(str, roll);

	// Reshuffle the input to the packed output format (12 bytes per
	// 128-bit lane, 24 bytes total):
	str = dec_reshuffle(str);

	// Store the output:
	_mm256_storeu_si256((__m256i *) *o, str);

	*s += 32;
	*o += 24;
	*rounds -= 1;

	return 1;
}
// Bulk decoder: decodes as many whole 32-byte rounds as can safely be
// processed, in manually unrolled batches of 8, 4, 2 and 1 rounds. When the
// inner decoder rejects a block, the loop stops and the consumed/produced
// counters are rolled back for the rounds that were not completed.
static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Too little input for a safe vector round; leave it to the tail.
	if (*slen < 45) {
		return;
	}

	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
	// written after the output, ensure that there will be at least 13
	// bytes of input data left to cover the gap. (11 data bytes and up to
	// two end-of-string markers.)
	size_t rounds = (*slen - 13) / 32;

	*slen -= rounds * 32; // 32 bytes consumed per round
	*olen += rounds * 24; // 24 bytes produced per round

	do {
		// Try an 8x unrolled batch; the && chain stops the batch at
		// the first block that fails to decode.
		if (rounds >= 8) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		// 4x batch, same early-out behavior.
		if (rounds >= 4) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		// 2x batch.
		if (rounds >= 2) {
			if (dec_loop_avx2_inner(s, o, &rounds) &&
			    dec_loop_avx2_inner(s, o, &rounds)) {
				continue;
			}
			break;
		}
		// Final single round (success or failure, we are done).
		dec_loop_avx2_inner(s, o, &rounds);
		break;

	} while (rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 32;
	*olen -= rounds * 24;
}

View file

@ -0,0 +1,34 @@
// Repack a vector of 32 decoded 6-bit values (one per byte) into 24
// contiguous output bytes at the front of the returned vector. The final 8
// bytes of the result are padding; the caller advances its output pointer by
// only 24 bytes and reserves room for the overspill.
static BASE64_FORCE_INLINE __m256i
dec_reshuffle (const __m256i in)
{
	// in, lower lane, bits, upper case are most significant bits, lower
	// case are least significant bits:
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	// 0000eeee FFffffff 0000DDDD DDddEEEE
	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
	// 00000000 JJJJJJjj KKKKkkkk LLllllll
	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	// 00000000 DDDDDDdd EEEEeeee FFffffff
	// 00000000 AAAAAAaa BBBBbbbb CCcccccc

	// Pack bytes together in each lane:
	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
	// 00000000 00000000 00000000 00000000
	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa

	// Pack lanes (move the 12 valid bytes of the upper lane down next to
	// the 12 valid bytes of the lower lane):
	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
}

View file

@ -0,0 +1,89 @@
static BASE64_FORCE_INLINE void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
// First load is done at s - 0 to not get a segfault:
__m256i src = _mm256_loadu_si256((__m256i *) *s);
// Shift by 4 bytes, as required by enc_reshuffle:
src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
// Reshuffle, translate, store:
src = enc_reshuffle(src);
src = enc_translate(src);
_mm256_storeu_si256((__m256i *) *o, src);
// Subsequent loads will be done at s - 4, set pointer for next round:
*s += 20;
*o += 32;
}
static BASE64_FORCE_INLINE void
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
__m256i src = _mm256_loadu_si256((__m256i *) *s);
// Reshuffle, translate, store:
src = enc_reshuffle(src);
src = enc_translate(src);
_mm256_storeu_si256((__m256i *) *o, src);
*s += 24;
*o += 32;
}
// Bulk encoder: encodes the input in 24-byte rounds (32 output bytes each),
// using a specially guarded first round and manually unrolled batches of
// 8/4/2/1 rounds for the remainder. Leftover bytes stay in *slen for the
// scalar tail encoder.
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Too little input for a safe 32-byte vector load.
	if (*slen < 32) {
		return;
	}

	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
	// bytes at a time at an offset of -4, ensure that there will be at
	// least 4 remaining bytes after the last round, so that the final
	// read will not pass beyond the bounds of the input buffer:
	size_t rounds = (*slen - 4) / 24;

	*slen -= rounds * 24; // 24 bytes consumed per round
	*olen += rounds * 32; // 32 bytes produced per round

	// The first loop iteration requires special handling to ensure that
	// the read, which is done at an offset, does not underflow the buffer:
	enc_loop_avx2_inner_first(s, o);
	rounds--;

	while (rounds > 0) {
		// 8x unrolled batch.
		if (rounds >= 8) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 8;
			continue;
		}
		// 4x unrolled batch.
		if (rounds >= 4) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 4;
			continue;
		}
		// 2x unrolled batch.
		if (rounds >= 2) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 2;
			continue;
		}
		// Final single round.
		enc_loop_avx2_inner(s, o);
		break;
	}

	// Add the offset back: the rounds above kept the source pointer 4
	// bytes behind the true read position.
	*s += 4;
}

View file

@ -0,0 +1,291 @@
// AVX2 Base64 encoder inner loop, implemented as one large inline-assembly
// statement templated with the preprocessor. Each round consumes 24 input
// bytes and produces 32 output bytes; loads are done at an offset of -4
// bytes, with the first round handled specially to avoid buffer underflow.
//
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round and a
// constant offset.
#define LOAD(R0, ROUND, OFFSET) \
	"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
#define SHUF(R0, R1, R2) \
	"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
	"vpand %["R1"], %[msk0], %["R2"] \n\t" \
	"vpand %["R1"], %[msk2], %["R1"] \n\t" \
	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
	"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
	"vpor %["R1"], %["R2"], %["R1"] \n\t"

// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
#define TRAN(R0, R1, R2) \
	"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
	"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
	"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
	"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
	"vpaddb %["R1"], %["R2"], %["R0"] \n\t"

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
	"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
	LOAD("a", 0, -4) \
	SHUF("a", "b", "c") \
	TRAN("a", "b", "c") \
	STOR("a", 0) \
	"add $24, %[src] \n\t" \
	"add $32, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() /* a b c d e f */ \
	LOAD("a", 0, -4) /* + */ \
	SHUF("a", "d", "e") /* | + x */ \
	LOAD("b", 1, -4) /* | + | */ \
	TRAN("a", "d", "e") /* | | - x */ \
	LOAD("c", 2, -4) /* V V V */

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	LOAD(D, (ROUND + 3), -4) /* V V V + */ \
	SHUF(B, E, F) /* | | | | + x */ \
	STOR(A, (ROUND + 0)) /* - | | | | */ \
	TRAN(B, E, F) /* | | | - x */ \
	LOAD(E, (ROUND + 4), -4) /* | | | + */ \
	SHUF(C, A, F) /* + | | | | x */ \
	STOR(B, (ROUND + 1)) /* | - | | | */ \
	TRAN(C, A, F) /* - | | | x */ \
	LOAD(F, (ROUND + 5), -4) /* | | | + */ \
	SHUF(D, A, B) /* + x | | | | */ \
	STOR(C, (ROUND + 2)) /* | - | | | */ \
	TRAN(D, A, B) /* - x V V V */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	SHUF(E, A, B) /* + x V V V */ \
	STOR(D, (ROUND + 3)) /* | - | | */ \
	TRAN(E, A, B) /* - x | | */ \
	SHUF(F, C, D) /* + x | | */ \
	STOR(E, (ROUND + 4)) /* | - | */ \
	TRAN(F, C, D) /* - x | */ \
	STOR(F, (ROUND + 5)) /* - */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
	ROUND_3_A(ROUND) \
	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
	ROUND_3_B(ROUND) \
	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"

static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// For a clearer explanation of the algorithm used by this function,
	// please refer to the plain (not inline assembly) implementation. This
	// function follows the same basic logic.

	// Not enough input for even a single 32-byte load.
	if (*slen < 32) {
		return;
	}

	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
	// bytes at a time at an offset of -4, ensure that there will be at
	// least 4 remaining bytes after the last round, so that the final
	// read will not pass beyond the bounds of the input buffer.
	size_t rounds = (*slen - 4) / 24;

	*slen -= rounds * 24; // 24 bytes consumed per round
	*olen += rounds * 32; // 32 bytes produced per round

	// Pre-decrement the number of rounds to get the number of rounds
	// *after* the first round, which is handled as a special case.
	rounds--;

	// Number of times to go through the 36x loop.
	size_t loops = rounds / 36;

	// Number of rounds remaining after the 36x loop.
	rounds %= 36;

	// Lookup tables.
	const __m256i lut0 = _mm256_set_epi8(
		10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
		14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5);

	const __m256i lut1 = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Temporary registers.
	__m256i a, b, c, d, e;

	// Temporary register f doubles as the shift mask for the first round.
	__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);

	// NOTE: numeric asm labels are block-local; a trailing 'f' in a jump
	// target means forward, a trailing 'b' means backward.
	__asm__ volatile (

		// The first loop iteration requires special handling to ensure
		// that the read, which is normally done at an offset of -4,
		// does not underflow the buffer. Load the buffer at an offset
		// of 0 and permute the input to achieve the same effect.
		LOAD("a", 0, 0)
		"vpermd %[a], %[f], %[a] \n\t"

		// Perform the standard shuffling and translation steps.
		SHUF("a", "b", "c")
		TRAN("a", "b", "c")

		// Store the result and increment the source and dest pointers.
		"vmovdqu %[a], (%[dst]) \n\t"
		"add $24, %[src] \n\t"
		"add $32, %[dst] \n\t"

		// If there are 36 rounds or more, enter a 36x unrolled loop of
		// interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations (table lookups,
		// etc) to maximize pipeline throughput.
		" test %[loops], %[loops] \n\t"
		" jz 18f \n\t"
		" jmp 36f \n\t"
		" \n\t"
		".balign 64 \n\t"
		"36: " ROUND_3_INIT()
		" " ROUND_3_A( 0)
		" " ROUND_3_B( 3)
		" " ROUND_3_A( 6)
		" " ROUND_3_B( 9)
		" " ROUND_3_A(12)
		" " ROUND_3_B(15)
		" " ROUND_3_A(18)
		" " ROUND_3_B(21)
		" " ROUND_3_A(24)
		" " ROUND_3_B(27)
		" " ROUND_3_A_LAST(30)
		" add $(24 * 36), %[src] \n\t"
		" add $(32 * 36), %[dst] \n\t"
		" dec %[loops] \n\t"
		" jnz 36b \n\t"

		// Enter an 18x unrolled loop for rounds of 18 or more.
		"18: cmp $18, %[rounds] \n\t"
		" jl 9f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B(3)
		" " ROUND_3_A(6)
		" " ROUND_3_B(9)
		" " ROUND_3_A_LAST(12)
		" sub $18, %[rounds] \n\t"
		" add $(24 * 18), %[src] \n\t"
		" add $(32 * 18), %[dst] \n\t"

		// Enter a 9x unrolled loop for rounds of 9 or more.
		"9: cmp $9, %[rounds] \n\t"
		" jl 6f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B_LAST(3)
		" sub $9, %[rounds] \n\t"
		" add $(24 * 9), %[src] \n\t"
		" add $(32 * 9), %[dst] \n\t"

		// Enter a 6x unrolled loop for rounds of 6 or more.
		"6: cmp $6, %[rounds] \n\t"
		" jl 55f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A_LAST(0)
		" sub $6, %[rounds] \n\t"
		" add $(24 * 6), %[src] \n\t"
		" add $(32 * 6), %[dst] \n\t"

		// Dispatch the remaining rounds 0..5.
		"55: cmp $3, %[rounds] \n\t"
		" jg 45f \n\t"
		" je 3f \n\t"
		" cmp $1, %[rounds] \n\t"
		" jg 2f \n\t"
		" je 1f \n\t"
		" jmp 0f \n\t"

		"45: cmp $4, %[rounds] \n\t"
		" je 4f \n\t"

		// Block of non-interleaved encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		"5: " ROUND()
		"4: " ROUND()
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		: [rounds] "+r" (rounds),
		  [loops] "+r" (loops),
		  [src] "+r" (*s),
		  [dst] "+r" (*o),
		  [a] "=&x" (a),
		  [b] "=&x" (b),
		  [c] "=&x" (c),
		  [d] "=&x" (d),
		  [e] "=&x" (e),
		  [f] "+x" (f)

		// Inputs (not modified).
		: [lut0] "x" (lut0),
		  [lut1] "x" (lut1),
		  [msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
		  [msk1] "x" (_mm256_set1_epi32(0x04000040)),
		  [msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
		  [msk3] "x" (_mm256_set1_epi32(0x01000010)),
		  [n51] "x" (_mm256_set1_epi8(51)),
		  [n25] "x" (_mm256_set1_epi8(25))

		// Clobbers.
		: "cc", "memory"
	);
}

#pragma GCC diagnostic pop

View file

@ -0,0 +1,83 @@
// Distribute each group of three input bytes over four output bytes as 6-bit
// groups, ready for alphabet translation by enc_translate(). The input must
// be pre-shifted by 4 bytes (see enc_loop_avx2_inner_first) so that each
// 128-bit lane can be processed independently.
static BASE64_FORCE_INLINE __m256i
enc_reshuffle (const __m256i input)
{
	// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
	// works with shifted (4 bytes) input in order to be able to work
	// efficiently in the two 128-bit lanes.

	// Input, bytes MSB to LSB:
	// 0 0 0 0 x w v u t s r q p o n m
	// l k j i h g f e d c b a 0 0 0 0
	const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
		10, 11, 9, 10,
		7, 8, 6, 7,
		4, 5, 3, 4,
		1, 2, 0, 1,

		14, 15, 13, 14,
		11, 12, 10, 11,
		8, 9, 7, 8,
		5, 6, 4, 5));
	// in, bytes MSB to LSB:
	// w x v w
	// t u s t
	// q r p q
	// n o m n
	// k l j k
	// h i g h
	// e f d e
	// b c a b

	const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
	// bits, upper case are most significant bits, lower case are least
	// significant bits.
	// 0000wwww XX000000 VVVVVV00 00000000
	// 0000tttt UU000000 SSSSSS00 00000000
	// 0000qqqq RR000000 PPPPPP00 00000000
	// 0000nnnn OO000000 MMMMMM00 00000000
	// 0000kkkk LL000000 JJJJJJ00 00000000
	// 0000hhhh II000000 GGGGGG00 00000000
	// 0000eeee FF000000 DDDDDD00 00000000
	// 0000bbbb CC000000 AAAAAA00 00000000

	const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
	// 00000000 00wwwwXX 00000000 00VVVVVV
	// 00000000 00ttttUU 00000000 00SSSSSS
	// 00000000 00qqqqRR 00000000 00PPPPPP
	// 00000000 00nnnnOO 00000000 00MMMMMM
	// 00000000 00kkkkLL 00000000 00JJJJJJ
	// 00000000 00hhhhII 00000000 00GGGGGG
	// 00000000 00eeeeFF 00000000 00DDDDDD
	// 00000000 00bbbbCC 00000000 00AAAAAA

	const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
	// 00000000 00xxxxxx 000000vv WWWW0000
	// 00000000 00uuuuuu 000000ss TTTT0000
	// 00000000 00rrrrrr 000000pp QQQQ0000
	// 00000000 00oooooo 000000mm NNNN0000
	// 00000000 00llllll 000000jj KKKK0000
	// 00000000 00iiiiii 000000gg HHHH0000
	// 00000000 00ffffff 000000dd EEEE0000
	// 00000000 00cccccc 000000aa BBBB0000

	const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
	// 00xxxxxx 00000000 00vvWWWW 00000000
	// 00uuuuuu 00000000 00ssTTTT 00000000
	// 00rrrrrr 00000000 00ppQQQQ 00000000
	// 00oooooo 00000000 00mmNNNN 00000000
	// 00llllll 00000000 00jjKKKK 00000000
	// 00iiiiii 00000000 00ggHHHH 00000000
	// 00ffffff 00000000 00ddEEEE 00000000
	// 00cccccc 00000000 00aaBBBB 00000000

	return _mm256_or_si256(t1, t3);
	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
}

View file

@ -0,0 +1,30 @@
// Map each 6-bit value (0..63, one per byte) onto its Base64 character by
// adding a per-range offset looked up from a small table.
static BASE64_FORCE_INLINE __m256i
enc_translate (const __m256i in)
{
	// Absolute offsets for the five value ranges:
	// # From To Abs Index Characters
	// 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
	// 2 [52..61] [48..57] -4 [2..11] 0123456789
	// 3 [62] [43] -19 12 +
	// 4 [63] [47] -16 13 /
	const __m256i offsets = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// 0xFF (-1) for ranges #[1..4], 0x00 for range #0:
	const __m256i gt25 = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	// Saturating subtract gives the right index for range #0 but one
	// less than expected for the others...
	__m256i idx = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// ...so subtract the -1 mask (i.e. add 1) for ranges #[1..4]:
	idx = _mm256_sub_epi8(idx, gt25);

	// Look up each byte's offset and apply it:
	return _mm256_add_epi8(in, _mm256_shuffle_epi8(offsets, idx));
}

View file

@ -0,0 +1,52 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_AVX512
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("avx512vbmi"))), apply_to=function)
#else
#pragma GCC target("avx512vbmi")
#endif
#include <immintrin.h>
#include "../avx2/dec_reshuffle.c"
#include "../avx2/dec_loop.c"
#include "enc_reshuffle_translate.c"
#include "enc_loop.c"
#endif // HAVE_AVX512
// Public entry point: Base64-encode a stream chunk with the AVX-512
// VBMI codec. The shared prologue/epilogue are spliced in via #include;
// when built without AVX-512 support, defer to the stub so the symbol
// still exists.
void
base64_stream_encode_avx512 BASE64_ENC_PARAMS
{
#if HAVE_AVX512
	#include "../generic/enc_head.c"
	enc_loop_avx512(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Reuse AVX2 decoding. Not supporting AVX512 at present
// Public entry point: Base64-decode a stream chunk. Reuses the AVX2
// decode loop (see comment above); the shared prologue/epilogue are
// spliced in via #include and `dec_tail.c` supplies the return.
int
base64_stream_decode_avx512 BASE64_DEC_PARAMS
{
#if HAVE_AVX512
	#include "../generic/dec_head.c"
	dec_loop_avx2(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
// Close the `#pragma clang attribute push` issued at the top of this
// file; this is the last function in the translation unit.
#if defined(__clang__)
#pragma clang attribute pop
#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,61 @@
// Encode one 48-byte block into 64 output characters: a single
// (oversized) 64-byte load, a combined reshuffle+translate step, and a
// single 64-byte store. Advances both pointers.
static BASE64_FORCE_INLINE void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
	// Load input (reads 64 bytes, of which only 48 are consumed).
	__m512i src = _mm512_loadu_si512((__m512i *) *s);

	// Reshuffle, translate, store.
	src = enc_reshuffle_translate(src);
	_mm512_storeu_si512((__m512i *) *o, src);

	*s += 48;
	*o += 64;
}
// Main AVX-512 encoding loop: consumes input in 48-byte blocks, each
// producing 64 output characters, while guaranteeing that the 64-byte
// wide loads in the inner function never read past the input buffer.
static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Each round consumes 48 bytes but loads 64; keep at least 24
	// trailing bytes in reserve so the final oversized read stays
	// inside the bounds of the input buffer.
	size_t rounds = (*slen - 24) / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	// Partially unrolled: take rounds four at a time, then finish the
	// remainder one round at a time.
	size_t left = rounds;
	while (left >= 4) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		left -= 4;
	}
	while (left > 0) {
		enc_loop_avx512_inner(s, o);
		left--;
	}
}

View file

@ -0,0 +1,50 @@
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.
static BASE64_FORCE_INLINE __m512i
enc_reshuffle_translate (const __m512i input)
{
// 32-bit input
// [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0|
// b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
// output order [1, 2, 0, 1]
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
0x04050304,
0x07080607,
0x0a0b090a,
0x0d0e0c0d,
0x10110f10,
0x13141213,
0x16171516,
0x191a1819,
0x1c1d1b1c,
0x1f201e1f,
0x22232122,
0x25262425,
0x28292728,
0x2b2c2a2b,
0x2e2f2d2e);
// Reorder bytes
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
// After multishift a single 32-bit lane has following layout
// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
// a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
// 48, 54, 36, 42, 16, 22, 4, 10
const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
// Translate immediately after reshuffled.
const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
// Translation 6-bit values to ASCII.
return _mm512_permutexvar_epi8(shuffled_in, lookup);
}

View file

@ -0,0 +1,86 @@
// Decode one block of four Base64 characters into three output bytes.
// Returns 1 on success. Returns 0 -- without consuming input or
// decrementing `rounds` -- when any of the four characters is invalid.
static BASE64_FORCE_INLINE int
dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Each table maps one input character to its decoded bits already
	// shifted into final position, so the four lookups simply OR:
	const uint32_t str
		= base64_table_dec_32bit_d0[(*s)[0]]
		| base64_table_dec_32bit_d1[(*s)[1]]
		| base64_table_dec_32bit_d2[(*s)[2]]
		| base64_table_dec_32bit_d3[(*s)[3]];

#if BASE64_LITTLE_ENDIAN

	// LUTs for little-endian set MSB in case of invalid character:
	if (str & UINT32_C(0x80000000)) {
		return 0;
	}
#else
	// LUTs for big-endian set LSB in case of invalid character:
	if (str & UINT32_C(1)) {
		return 0;
	}
#endif
	// Store the output. This writes four bytes although only three are
	// produced; the caller reserves room for the extra scratch byte
	// (see the accounting in dec_loop_generic_32):
	memcpy(*o, &str, sizeof (str));

	*s += 4;
	*o += 3;
	*rounds -= 1;

	return 1;
}
// Bulk scalar decoding loop: four input characters in, three bytes out,
// per round. Stops at the first invalid character and rolls back the
// accounting for any unprocessed rounds.
static inline void
dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 8) {
		return;
	}

	// Each round writes one extra zero byte after its three output
	// bytes, so hold back at least four input bytes to cover the gap
	// (two data bytes and up to two end-of-string markers):
	size_t rounds = (*slen - 4) / 4;

	*slen -= rounds * 4;	// 4 bytes consumed per round
	*olen += rounds * 3;	// 3 bytes produced per round

	// The inner helper decrements `rounds` on success and returns zero
	// on invalid input, in which case we stop here and let the
	// bytewise decoder take over for error checking and reporting:
	while (rounds > 0) {
		if (!dec_loop_generic_32_inner(s, o, &rounds)) {
			break;
		}
	}

	// Return any unprocessed rounds to the caller's accounting:
	*slen += rounds * 4;
	*olen -= rounds * 3;
}

View file

@ -0,0 +1,73 @@
// Encode one 3-byte block into four Base64 characters using the 12-bit
// lookup table. Reads four bytes (one beyond the block); the caller
// guarantees that this read stays inside the input buffer.
static BASE64_FORCE_INLINE void
enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
{
	uint32_t word;

	// Load a full 32-bit word; only its top three bytes belong to
	// this block:
	memcpy(&word, *s, sizeof (word));

	// Normalize to big-endian so that shifts move bits across byte
	// boundaries in input order:
	word = BASE64_HTOBE32(word);

	// Slice two 12-bit groups out of the top 24 bits; each group
	// indexes a table entry holding two precomputed characters:
	const size_t hi = (word >> 20) & 0xFFFU;
	const size_t lo = (word >>  8) & 0xFFFU;

	memcpy(*o,     base64_table_enc_12bit + hi, 2);
	memcpy(*o + 2, base64_table_enc_12bit + lo, 2);

	// Three input bytes became four output characters:
	*s += 3;
	*o += 4;
}
// Bulk 32-bit scalar encoding loop: three bytes in, four characters
// out, per round, with a one-byte input reserve for the oversized load
// in the inner function.
static inline void
enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 4) {
		return;
	}

	// Each round consumes three bytes but loads four; hold back one
	// trailing byte so the final read stays inside the input buffer:
	size_t rounds = (*slen - 1) / 3;

	*slen -= rounds * 3;	// 3 bytes consumed per round
	*olen += rounds * 4;	// 4 bytes produced per round

	// Partially unrolled: four rounds per iteration, then mop up the
	// remainder one round at a time.
	while (rounds >= 4) {
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		rounds -= 4;
	}
	while (rounds > 0) {
		enc_loop_generic_32_inner(s, o);
		rounds--;
	}
}

View file

@ -0,0 +1,77 @@
// Encode one 6-byte block into eight Base64 characters using the
// 12-bit lookup table. Reads eight bytes (two beyond the block); the
// caller guarantees that this read stays inside the input buffer.
static BASE64_FORCE_INLINE void
enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
{
	uint64_t word;

	// Load a full 64-bit word; only its top six bytes belong to this
	// block:
	memcpy(&word, *s, sizeof (word));

	// Normalize to big-endian so that shifts move bits across byte
	// boundaries in input order:
	word = BASE64_HTOBE64(word);

	// Slice four 12-bit groups out of the top 48 bits; each group
	// indexes a table entry holding two precomputed characters:
	const size_t i0 = (word >> 52) & 0xFFFU;
	const size_t i1 = (word >> 40) & 0xFFFU;
	const size_t i2 = (word >> 28) & 0xFFFU;
	const size_t i3 = (word >> 16) & 0xFFFU;

	memcpy(*o,     base64_table_enc_12bit + i0, 2);
	memcpy(*o + 2, base64_table_enc_12bit + i1, 2);
	memcpy(*o + 4, base64_table_enc_12bit + i2, 2);
	memcpy(*o + 6, base64_table_enc_12bit + i3, 2);

	// Six input bytes became eight output characters:
	*s += 6;
	*o += 8;
}
// Bulk 64-bit scalar encoding loop: six bytes in, eight characters out,
// per round, with a two-byte input reserve for the oversized load in
// the inner function.
static inline void
enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 8) {
		return;
	}

	// Each round consumes six bytes but loads eight; hold back two
	// trailing bytes so the final read stays inside the input buffer:
	size_t rounds = (*slen - 2) / 6;

	*slen -= rounds * 6;	// 6 bytes consumed per round
	*olen += rounds * 8;	// 8 bytes produced per round

	// Partially unrolled: four rounds per iteration, then mop up the
	// remainder one round at a time.
	while (rounds >= 4) {
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		rounds -= 4;
	}
	while (rounds > 0) {
		enc_loop_generic_64_inner(s, o);
		rounds--;
	}
}

View file

@ -0,0 +1,41 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if BASE64_WORDSIZE == 32
# include "32/enc_loop.c"
#elif BASE64_WORDSIZE == 64
# include "64/enc_loop.c"
#endif
#if BASE64_WORDSIZE >= 32
# include "32/dec_loop.c"
#endif
// Public entry point: plain (scalar) Base64 encoder. The shared
// prologue/epilogue are spliced in via #include; the inner loop is the
// 32- or 64-bit wordwise variant matching the native word size.
void
base64_stream_encode_plain BASE64_ENC_PARAMS
{
	#include "enc_head.c"
#if BASE64_WORDSIZE == 32
	enc_loop_generic_32(&s, &slen, &o, &olen);
#elif BASE64_WORDSIZE == 64
	enc_loop_generic_64(&s, &slen, &o, &olen);
#endif
	#include "enc_tail.c"
}
// Public entry point: plain (scalar) Base64 decoder. The shared
// prologue/epilogue are spliced in via #include; the return statement
// lives in `dec_tail.c`.
int
base64_stream_decode_plain BASE64_DEC_PARAMS
{
	#include "dec_head.c"
#if BASE64_WORDSIZE >= 32
	dec_loop_generic_32(&s, &slen, &o, &olen);
#endif
	#include "dec_tail.c"
}

View file

@ -0,0 +1,37 @@
// Decoder prologue, spliced into each codec's stream-decode function
// with #include. Declares the locals (ret, s, o, q, slen, olen, st)
// that the codec loops and the `dec_tail.c` epilogue expect in scope,
// and opens a Duff's device that `dec_tail.c` closes.
int ret = 0;

const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;
uint8_t q;

// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.eof = state->eof;
st.bytes = state->bytes;
st.carry = state->carry;

// If we previously saw an EOF or an invalid character, bail out:
if (st.eof) {
	*outlen = 0;
	ret = 0;

	// If there was a trailing '=' to check, check it
	// (254 is the decode table's code for '='; see dec_tail.c):
	if (slen && (st.eof == BASE64_AEOF)) {
		state->bytes = 0;
		state->eof = BASE64_EOF;
		ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
	}

	return ret;
}

// Turn four 6-bit numbers into three bytes:
// out[0] = 11111122
// out[1] = 22223333
// out[2] = 33444444

// Duff's device again (closed by the matching `dec_tail.c` include):
switch (st.bytes)
{
	for (;;)
	{
	case 0:
View file

@ -0,0 +1,91 @@
		// Decoder epilogue, spliced in with #include: the body of the
		// Duff's device opened in `dec_head.c`. Decodes four 6-bit
		// symbols into three bytes. In the decode table, 254 means '='
		// and 255 means invalid input. Writes the state back and
		// returns `ret` (1 = clean stop, 0 = invalid input).
		if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.eof = BASE64_EOF;
			// Treat character '=' as invalid for byte 0:
			break;
		}
		st.carry = q << 2;
		st.bytes++;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 1:	if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.eof = BASE64_EOF;
			// Treat character '=' as invalid for byte 1:
			break;
		}
		*o++ = st.carry | (q >> 4);
		st.carry = q << 4;
		st.bytes++;
		olen++;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 2:	if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.bytes++;
			// When q == 254, the input char is '='.
			// Check if next byte is also '=':
			if (q == 254) {
				if (slen-- != 0) {
					st.bytes = 0;
					// EOF:
					st.eof = BASE64_EOF;
					q = base64_table_dec_8bit[*s++];
					ret = ((q == 254) && (slen == 0)) ? 1 : 0;
					break;
				}
				else {
					// Almost EOF: stream ended between the two
					// padding characters; finish on the next call.
					st.eof = BASE64_AEOF;
					ret = 1;
					break;
				}
			}
			// If we get here, there was an error:
			break;
		}
		*o++ = st.carry | (q >> 2);
		st.carry = q << 6;
		st.bytes++;
		olen++;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 3:	if (slen-- == 0) {
			ret = 1;
			break;
		}
		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
			st.bytes = 0;
			st.eof = BASE64_EOF;
			// When q == 254, the input char is '='. Return 1 and EOF.
			// When q == 255, the input char is invalid. Return 0 and EOF.
			ret = ((q == 254) && (slen == 0)) ? 1 : 0;
			break;
		}
		*o++ = st.carry | q;
		st.carry = 0;
		st.bytes = 0;
		olen++;
	}
}

// Write the local temporaries back to the caller-visible state:
state->eof = st.eof;
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;

return ret;

View file

@ -0,0 +1,24 @@
// Encoder prologue, spliced into each codec's stream-encode function
// with #include. Declares the locals (s, o, slen, olen, st) that the
// codec loops and the `enc_tail.c` epilogue expect in scope, and opens
// a Duff's device that `enc_tail.c` closes.

// Assume that *out is large enough to contain the output.
// Theoretically it should be 4/3 the length of src.
const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;

// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.bytes = state->bytes;
st.carry = state->carry;

// Turn three bytes into four 6-bit numbers:
// in[0] = 00111111
// in[1] = 00112222
// in[2] = 00222233
// in[3] = 00333333

// Duff's device, a for() loop inside a switch() statement. Legal!
switch (st.bytes)
{
	for (;;)
	{
	case 0:

View file

@ -0,0 +1,34 @@
		// Encoder epilogue, spliced in with #include: body of the
		// Duff's device opened in `enc_head.c`. Emits one Base64
		// character per 6 input bits, carrying leftover bits between
		// input bytes in `st.carry`, then writes the state back.
		if (slen-- == 0) {
			break;
		}
		*o++ = base64_table_enc_6bit[*s >> 2];
		st.carry = (*s++ << 4) & 0x30;
		st.bytes++;
		olen += 1;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 1:	if (slen-- == 0) {
			break;
		}
		*o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
		st.carry = (*s++ << 2) & 0x3C;
		st.bytes++;
		olen += 1;

		// Deliberate fallthrough:
		BASE64_FALLTHROUGH

	case 2:	if (slen-- == 0) {
			break;
		}
		*o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
		*o++ = base64_table_enc_6bit[*s++ & 0x3F];
		st.bytes = 0;
		olen += 2;
	}
}

// Write the local temporaries back to the caller-visible state:
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;

View file

@ -0,0 +1,79 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#ifdef __arm__
# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
# define BASE64_USE_NEON32
# endif
#endif
#ifdef BASE64_USE_NEON32
#include <arm_neon.h>
// Only enable inline assembly on supported compilers.
#if defined(__GNUC__) || defined(__clang__)
#define BASE64_NEON32_USE_ASM
#endif
// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
// the NEON64 `vqtbl1q_u8` intrinsic: split the 128-bit LUT into a pair
// of D registers, look up each half of the indices, and recombine.
static BASE64_FORCE_INLINE uint8x16_t
vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
{
	uint8x8x2_t halves;
	halves.val[0] = vget_low_u8(lut);
	halves.val[1] = vget_high_u8(lut);

	const uint8x8_t lo = vtbl2_u8(halves, vget_low_u8(indices));
	const uint8x8_t hi = vtbl2_u8(halves, vget_high_u8(indices));

	return vcombine_u8(lo, hi);
}
#include "../generic/32/dec_loop.c"
#include "../generic/32/enc_loop.c"
#include "dec_loop.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"
#include "enc_loop.c"
#endif // BASE64_USE_NEON32
// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 32 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.
// Public entry point: NEON32 Base64 encoder. Runs the 48-byte-stride
// NEON loop first, then the inlined 32-bit wordwise loop for the tail
// (see the stride comment above); falls back on the stub when NEON32
// support is not compiled in.
void
base64_stream_encode_neon32 BASE64_ENC_PARAMS
{
#ifdef BASE64_USE_NEON32
	#include "../generic/enc_head.c"
	enc_loop_neon32(&s, &slen, &o, &olen);
	enc_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Public entry point: NEON32 Base64 decoder. Runs the 64-byte-stride
// NEON loop first, then the inlined 32-bit wordwise loop for the tail;
// `dec_tail.c` supplies the return. Falls back on the stub when NEON32
// support is not compiled in.
int
base64_stream_decode_neon32 BASE64_DEC_PARAMS
{
#ifdef BASE64_USE_NEON32
	#include "../generic/dec_head.c"
	dec_loop_neon32(&s, &slen, &o, &olen);
	dec_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,106 @@
// Return nonzero if any byte in `v` is nonzero. Each 64-bit lane is
// saturating-narrowed to 32 bits (vqmovn_u64), so any set bit survives
// into the 64-bit scalar that is finally tested.
static BASE64_FORCE_INLINE int
is_nonzero (const uint8x16_t v)
{
	uint64_t u64;
	const uint64x2_t v64 = vreinterpretq_u64_u8(v);
	const uint32x2_t v32 = vqmovn_u64(v64);

	vst1_u64(&u64, vreinterpret_u64_u32(v32));
	return u64 != 0;
}
// Look up the per-byte delta values that map Base64 characters back to
// their 6-bit values. The indices in `v` come from the classification
// in dec_loop_neon32_lane; out-of-range indices produce 0 per the VTBL
// semantics.
static BASE64_FORCE_INLINE uint8x16_t
delta_lookup (const uint8x16_t v)
{
	const uint8x8_t lut = {
		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
	};

	return vcombine_u8(
		vtbl1_u8(lut, vget_low_u8(v)),
		vtbl1_u8(lut, vget_high_u8(v)));
}
// Decode one 16-byte lane in place by adding per-range delta values to
// each character. Returns a classification mask; the caller treats any
// nonzero byte in it as invalid input.
static BASE64_FORCE_INLINE uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
	// See the SSSE3 decoder for an explanation of the algorithm.
	const uint8x16_t lut_lo = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
	};

	const uint8x16_t lut_hi = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
	};

	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);

	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
	const uint8x16_t eq_2F      = vceqq_u8(*lane, mask_2F);

	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

	// Now simply add the delta values to the input:
	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));

	// Return the validity mask:
	return vandq_u8(lo, hi);
}
// Bulk NEON32 decoding loop: 64 input characters in, 48 output bytes
// out, per round. Stops at the first round containing an invalid
// character and rolls back the accounting for unprocessed rounds so the
// bytewise decoder can report the error.
static inline void
dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	do {
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
		uint8x16x4_t str = vld4q_u8(*s);

		// Decode each lane, collect a mask of invalid inputs:
		const uint8x16_t classified
			= dec_loop_neon32_lane(&str.val[0])
			| dec_loop_neon32_lane(&str.val[1])
			| dec_loop_neon32_lane(&str.val[2])
			| dec_loop_neon32_lane(&str.val[3]);

		// Check for invalid input: if any of the delta values are
		// zero, fall back on bytewise code to do error checking and
		// reporting:
		if (is_nonzero(classified)) {
			break;
		}

		// Compress four bytes into three:
		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

		// Interleave and store decoded result:
		vst3q_u8(*o, dec);

		*s += 64;
		*o += 48;
	} while (--rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}

View file

@ -0,0 +1,170 @@
#ifdef BASE64_NEON32_USE_ASM
// Hand-written ARMv7 NEON assembly version of the inner encode loop:
// encodes one 48-byte block into 64 output characters and advances both
// pointers through the post-increment addressing of vld3/vst4.
static BASE64_FORCE_INLINE void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
	// This function duplicates the functionality of enc_loop_neon32_inner,
	// but entirely with inline assembly. This gives a significant speedup
	// over using NEON intrinsics, which do not always generate very good
	// code. The logic of the assembly is directly lifted from the
	// intrinsics version, so it can be used as a guide to this code.

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;
	uint8x16_t mask0, mask1, mask2, mask3;

	// A lookup table containing the absolute offsets for all ranges.
	const uint8x16_t lut = {
		 65U,  71U, 252U, 252U,
		252U, 252U, 252U, 252U,
		252U, 252U, 252U, 252U,
		237U, 240U,   0U,   0U
	};

	// Numeric constants.
	const uint8x16_t n51 = vdupq_n_u8(51);
	const uint8x16_t n25 = vdupq_n_u8(25);
	const uint8x16_t n63 = vdupq_n_u8(63);

	__asm__ (

		// Load 48 bytes and deinterleave. The bytes are loaded to
		// hard-coded registers q12, q13 and q14, to ensure that they
		// are contiguous. Increment the source pointer.
		"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
		"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"

		// Reshuffle the bytes using temporaries.
		"vshr.u8 %q[t0], q12, #2 \n\t"
		"vshr.u8 %q[t1], q13, #4 \n\t"
		"vshr.u8 %q[t2], q14, #6 \n\t"
		"vsli.8  %q[t1], q12, #4 \n\t"
		"vsli.8  %q[t2], q13, #2 \n\t"
		"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
		"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
		"vand.u8 %q[t3], q14,    %q[n63] \n\t"

		// t0..t3 are the reshuffled inputs. Create LUT indices.
		"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
		"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
		"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
		"vqsub.u8 q15, %q[t3], %q[n51] \n\t"

		// Create the mask for range #0.
		"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
		"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
		"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
		"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"

		// Subtract -1 to correct the LUT indices.
		"vsub.u8 q12, %q[m0] \n\t"
		"vsub.u8 q13, %q[m1] \n\t"
		"vsub.u8 q14, %q[m2] \n\t"
		"vsub.u8 q15, %q[m3] \n\t"

		// Lookup the delta values.
		"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
		"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
		"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
		"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
		"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
		"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
		"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
		"vtbl.u8 d31, {%q[lut]}, d31 \n\t"

		// Add the delta values.
		"vadd.u8 q12, %q[t0] \n\t"
		"vadd.u8 q13, %q[t1] \n\t"
		"vadd.u8 q14, %q[t2] \n\t"
		"vadd.u8 q15, %q[t3] \n\t"

		// Store 64 bytes and interleave. Increment the dest pointer.
		"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
		"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"

		// Outputs (modified).
		: [src] "+r"  (*s),
		  [dst] "+r"  (*o),
		  [t0]  "=&w" (tmp0),
		  [t1]  "=&w" (tmp1),
		  [t2]  "=&w" (tmp2),
		  [t3]  "=&w" (tmp3),
		  [m0]  "=&w" (mask0),
		  [m1]  "=&w" (mask1),
		  [m2]  "=&w" (mask2),
		  [m3]  "=&w" (mask3)

		// Inputs (not modified).
		: [lut] "w" (lut),
		  [n25] "w" (n25),
		  [n51] "w" (n51),
		  [n63] "w" (n63)

		// Clobbers.
		: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
		  "cc", "memory"
	);
}
#endif
// Encode one 48-byte block into 64 output characters, dispatching to
// the inline-assembly version when available, otherwise using the
// intrinsics-based reshuffle + translate pipeline.
static BASE64_FORCE_INLINE void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
	// The asm variant advances both pointers itself:
	enc_loop_neon32_inner_asm(s, o);
#else
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Reshuffle:
	uint8x16x4_t out = enc_reshuffle(src);

	// Translate reshuffled bytes to the Base64 alphabet:
	out = enc_translate(out);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
#endif
}
// Bulk NEON32 encoding loop: 48 input bytes in, 64 output characters
// out, per round. The inner function reads exactly 48 bytes per round
// (vld3q_u8), so no trailing input reserve is needed.
static inline void
enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	// Partially unrolled: four rounds per iteration, then mop up the
	// remainder one round at a time.
	while (rounds >= 4) {
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		rounds -= 4;
	}
	while (rounds > 0) {
		enc_loop_neon32_inner(s, o);
		rounds--;
	}
}

View file

@ -0,0 +1,31 @@
// Spread the bits of three input byte-vectors over four output
// byte-vectors so that each output byte carries one 6-bit Base64 value
// in its low six bits.
static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Input:
	// in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
	// in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
	// in[2]  = c7 c6 c5 c4 c3 c2 c1 c0

	// Output:
	// out[0] = 00 00 a7 a6 a5 a4 a3 a2
	// out[1] = 00 00 a1 a0 b7 b6 b5 b4
	// out[2] = 00 00 b3 b2 b1 b0 c7 c6
	// out[3] = 00 00 c5 c4 c3 c2 c1 c0

	// Move the input bits to where they need to be in the outputs. Except
	// for the first output, the high two bits are not cleared.
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vshrq_n_u8(in.val[1], 4);
	out.val[2] = vshrq_n_u8(in.val[2], 6);
	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4);
	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2);

	// Clear the high two bits in the second, third and fourth output.
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(in.val[2],  vdupq_n_u8(0x3F));

	return out;
}

View file

@ -0,0 +1,57 @@
// Translate reshuffled 6-bit values (0..63, one per byte, four vectors)
// to Base64 ASCII characters by adding a per-range offset looked up
// from `lut`.
static BASE64_FORCE_INLINE uint8x16x4_t
enc_translate (const uint8x16x4_t in)
{
	// A lookup table containing the absolute offsets for all ranges:
	const uint8x16_t lut = {
		 65U,  71U, 252U, 252U,
		252U, 252U, 252U, 252U,
		252U, 252U, 252U, 252U,
		237U, 240U,   0U,   0U
	};

	const uint8x16_t offset = vdupq_n_u8(51);

	uint8x16x4_t indices, mask, delta, out;

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /

	// Create LUT indices from input:
	// the index for range #0 is right, others are 1 less than expected
	// (saturating subtraction clamps range #0 down to index 0):
	indices.val[0] = vqsubq_u8(in.val[0], offset);
	indices.val[1] = vqsubq_u8(in.val[1], offset);
	indices.val[2] = vqsubq_u8(in.val[2], offset);
	indices.val[3] = vqsubq_u8(in.val[3], offset);

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
	mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
	mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
	mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));

	// Subtract -1, so add 1 to indices for range #[1..4], All indices are
	// now correct:
	indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
	indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
	indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
	indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);

	// Lookup delta values:
	delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
	delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
	delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
	delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);

	// Add delta values:
	out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
	out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
	out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
	out.val[3] = vaddq_u8(in.val[3], delta.val[3]);

	return out;
}

View file

@ -0,0 +1,93 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_NEON64
#include <arm_neon.h>
// Only enable inline assembly on supported compilers.
#if !defined(__wasm__) && (defined(__GNUC__) || defined(__clang__))
#define BASE64_NEON64_USE_ASM
#endif
// Load a 64-byte lookup table into a uint8x16x4_t. The asm variant
// pins the four vectors to registers v8..v11 so the table occupies
// contiguous registers, which the enc loop's asm relies on.
static BASE64_FORCE_INLINE uint8x16x4_t
load_64byte_table (const uint8_t *p)
{
#ifdef BASE64_NEON64_USE_ASM

	// Force the table to be loaded into contiguous registers. GCC will not
	// normally allocate contiguous registers for a `uint8x16x4_t'. These
	// registers are chosen to not conflict with the ones in the enc loop.
	register uint8x16_t t0 __asm__ ("v8");
	register uint8x16_t t1 __asm__ ("v9");
	register uint8x16_t t2 __asm__ ("v10");
	register uint8x16_t t3 __asm__ ("v11");

	__asm__ (
		"ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
		: [src] "+r" (p),
		  [t0]  "=w" (t0),
		  [t1]  "=w" (t1),
		  [t2]  "=w" (t2),
		  [t3]  "=w" (t3)
	);

	return (uint8x16x4_t) {
		.val[0] = t0,
		.val[1] = t1,
		.val[2] = t2,
		.val[3] = t3,
	};
#else
	return vld1q_u8_x4(p);
#endif
}
#include "../generic/32/dec_loop.c"
#include "../generic/64/enc_loop.c"
#include "dec_loop.c"
#ifdef BASE64_NEON64_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_reshuffle.c"
# include "enc_loop.c"
#endif
#endif // HAVE_NEON64
// Stride size is so large on these NEON 64-bit functions
// (48 bytes encode, 64 bytes decode) that we inline the
// uint64 codec to stay performant on smaller inputs.
// Public entry point: NEON64 Base64 encoder. Runs the 48-byte-stride
// NEON loop first, then the inlined 64-bit wordwise loop for the tail
// (see the stride comment above); falls back on the stub when NEON64
// support is not compiled in.
void
base64_stream_encode_neon64 BASE64_ENC_PARAMS
{
#if HAVE_NEON64
	#include "../generic/enc_head.c"
	enc_loop_neon64(&s, &slen, &o, &olen);
	enc_loop_generic_64(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Public entry point: NEON64 Base64 decoder. Runs the 64-byte-stride
// NEON loop first, then the inlined 32-bit wordwise loop for the tail;
// `dec_tail.c` supplies the return. Falls back on the stub when NEON64
// support is not compiled in.
int
base64_stream_decode_neon64 BASE64_DEC_PARAMS
{
#if HAVE_NEON64
	#include "../generic/dec_head.c"
	dec_loop_neon64(&s, &slen, &o, &olen);
	dec_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,129 @@
// The input consists of five valid character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
// # From To LUT Characters
// 1 [0..42] [255] #1 invalid input
// 2 [43] [62] #1 +
// 3 [44..46] [255] #1 invalid input
// 4 [47] [63] #1 /
// 5 [48..57] [52..61] #1 0..9
// 6 [58..63] [255] #1 invalid input
// 7 [64] [255] #2 invalid input
// 8 [65..90] [0..25] #2 A..Z
// 9 [91..96] [255] #2 invalid input
// 10 [97..122] [26..51] #2 a..z
// 11 [123..126] [255] #2 invalid input
// (12) Everything else => invalid input
// The first LUT will use the VTBL instruction (out of range indices are set to
// 0 in destination).
static const uint8_t dec_lut1[] = {
255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 62U, 255U, 255U, 255U, 63U,
52U, 53U, 54U, 55U, 56U, 57U, 58U, 59U, 60U, 61U, 255U, 255U, 255U, 255U, 255U, 255U,
};
// The second LUT will use the VTBX instruction (out of range indices will be
// unchanged in destination). Input [64..126] will be mapped to index [1..63]
// in this LUT. Index 0 means that value comes from LUT #1.
static const uint8_t dec_lut2[] = {
0U, 255U, 0U, 1U, 2U, 3U, 4U, 5U, 6U, 7U, 8U, 9U, 10U, 11U, 12U, 13U,
14U, 15U, 16U, 17U, 18U, 19U, 20U, 21U, 22U, 23U, 24U, 25U, 255U, 255U, 255U, 255U,
255U, 255U, 26U, 27U, 28U, 29U, 30U, 31U, 32U, 33U, 34U, 35U, 36U, 37U, 38U, 39U,
40U, 41U, 42U, 43U, 44U, 45U, 46U, 47U, 48U, 49U, 50U, 51U, 255U, 255U, 255U, 255U,
};
// All input values in range for the first look-up will be 0U in the second
// look-up result. All input values out of range for the first look-up will be
// 0U in the first look-up result. Thus, the two results can be ORed without
// conflicts.
//
// Invalid characters that are in the valid range for either look-up will be
// set to 255U in the combined result. Other invalid characters will just be
// passed through with the second look-up result (using the VTBX instruction).
// Since the second LUT is 64 bytes, those passed-through values are guaranteed
// to have a value greater than 63U. Therefore, valid characters will be mapped
// to the valid [0..63] range and all invalid characters will be mapped to
// values greater than 63.
static inline void
dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
if (*slen < 64) {
return;
}
// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
// extra trailing zero bytes are written, so it is not necessary to
// reserve extra input bytes:
size_t rounds = *slen / 64;
*slen -= rounds * 64; // 64 bytes consumed per round
*olen += rounds * 48; // 48 bytes produced per round
const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
do {
const uint8x16_t offset = vdupq_n_u8(63U);
uint8x16x4_t dec1, dec2;
uint8x16x3_t dec;
// Load 64 bytes and deinterleave:
uint8x16x4_t str = vld4q_u8((uint8_t *) *s);
// Get indices for second LUT:
dec2.val[0] = vqsubq_u8(str.val[0], offset);
dec2.val[1] = vqsubq_u8(str.val[1], offset);
dec2.val[2] = vqsubq_u8(str.val[2], offset);
dec2.val[3] = vqsubq_u8(str.val[3], offset);
// Get values from first LUT:
dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
// Get values from second LUT:
dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
// Get final values:
str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
// Check for invalid input, any value larger than 63:
const uint8x16_t classified
= vorrq_u8(
vorrq_u8(vcgtq_u8(str.val[0], vdupq_n_u8(63)), vcgtq_u8(str.val[1], vdupq_n_u8(63))),
vorrq_u8(vcgtq_u8(str.val[2], vdupq_n_u8(63)), vcgtq_u8(str.val[3], vdupq_n_u8(63)))
);
// Check that all bits are zero:
if (vmaxvq_u8(classified) != 0U) {
break;
}
// Compress four bytes into three:
dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
// Interleave and store decoded result:
vst3q_u8((uint8_t *) *o, dec);
*s += 64;
*o += 48;
} while (--rounds > 0);
// Adjust for any rounds that were skipped:
*slen += rounds * 64;
*olen -= rounds * 48;
}

View file

@ -0,0 +1,66 @@
// Encode one 48-byte input block into 64 Base64 characters and advance both
// pointers. The caller guarantees that 48 input bytes are available and that
// 64 output bytes can be written.
static BASE64_FORCE_INLINE void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Divide bits of three input bytes over four output bytes:
	uint8x16x4_t out = enc_reshuffle(src);

	// The bits have now been shifted to the right locations;
	// translate their values 0..63 to the Base64 alphabet.
	// Use a 64-byte table lookup:
	out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
	out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
	out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
	out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
}
// Bulk NEON64 encoding driver: converts as many whole 48-byte input blocks
// as possible into 64-byte Base64 blocks. Pointers and remaining-length
// counters are advanced in place; any partial trailing block is left for
// the generic tail code.
static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	// Load the encoding table once, outside the loop:
	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);

	// Manually unrolled dispatch: take the largest batch (8, 4, 2 or 1
	// rounds) that still fits, then go around again for the remainder.
	for (;;) {
		if (rounds >= 8) {
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			rounds -= 8;
		} else if (rounds >= 4) {
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			rounds -= 4;
		} else if (rounds >= 2) {
			enc_loop_neon64_inner(s, o, tbl_enc);
			enc_loop_neon64_inner(s, o, tbl_enc);
			rounds -= 2;
		} else {
			if (rounds == 1) {
				enc_loop_neon64_inner(s, o, tbl_enc);
			}
			break;
		}
		if (rounds == 0) {
			break;
		}
	}
}

View file

@ -0,0 +1,168 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// Generate a block of inline assembly that loads three user-defined registers
// A, B, C from memory and deinterleaves them, post-incrementing the src
// pointer. The register set should be sequential.
#define LOAD(A, B, C) \
	"ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t"

// Generate a block of inline assembly that takes three deinterleaved registers
// and shuffles the bytes. The output is in temporary registers t0..t3.
// (Same shift/insert/mask scheme as the intrinsics in enc_reshuffle.c.)
#define SHUF(A, B, C) \
	"ushr %[t0].16b, "A".16b, #2 \n\t" \
	"ushr %[t1].16b, "B".16b, #4 \n\t" \
	"ushr %[t2].16b, "C".16b, #6 \n\t" \
	"sli %[t1].16b, "A".16b, #4 \n\t" \
	"sli %[t2].16b, "B".16b, #2 \n\t" \
	"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
	"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
	"and %[t3].16b, "C".16b, %[n63].16b \n\t"

// Generate a block of inline assembly that takes temporary registers t0..t3
// and translates them to the base64 alphabet, using a table loaded into
// v8..v11. The output is in user-defined registers A..D.
#define TRAN(A, B, C, D) \
	"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
	"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
	"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
	"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"

// Generate a block of inline assembly that interleaves four registers and
// stores them, post-incrementing the destination pointer.
#define STOR(A, B, C, D) \
	"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result.
#define ROUND() \
	LOAD("v12", "v13", "v14") \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")

// Generate a block of assembly that generates a type A interleaved encoder
// round. It uses registers that were loaded by the previous type B round, and
// in turn loads registers for the next type B round.
// Data ping-pongs between register sets v2..v5 and v12..v15 so that the load
// for the next round overlaps with processing of the current one.
#define ROUND_A() \
	SHUF("v2", "v3", "v4") \
	LOAD("v12", "v13", "v14") \
	TRAN("v2", "v3", "v4", "v5") \
	STOR("v2", "v3", "v4", "v5")

// Type B interleaved encoder round. Same as type A, but register sets swapped.
#define ROUND_B() \
	SHUF("v12", "v13", "v14") \
	LOAD("v2", "v3", "v4") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")

// The first type A round needs to load its own registers.
#define ROUND_A_FIRST() \
	LOAD("v2", "v3", "v4") \
	ROUND_A()

// The last type B round omits the load for the next step.
#define ROUND_B_LAST() \
	SHUF("v12", "v13", "v14") \
	TRAN("v12", "v13", "v14", "v15") \
	STOR("v12", "v13", "v14", "v15")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
// Bulk NEON64 encoder, implemented in inline assembly for fine-grained
// control over register allocation and instruction scheduling. Same contract
// as the intrinsics version: encode whole 48-byte blocks from *s into
// 64-byte Base64 blocks at *o, advancing pointers and adjusting the
// remaining-length counters in place.
static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	if (rounds == 0) {
		return;
	}

	*slen -= rounds * 48;	// 48 bytes consumed per round.
	*olen += rounds * 64;	// 64 bytes produced per round.

	// Number of times to go through the 8x loop.
	size_t loops = rounds / 8;

	// Number of rounds remaining after the 8x loop.
	rounds %= 8;

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;

	__asm__ volatile (

		// Load the encoding table into v8..v11.
		" ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"

		// If there are eight rounds or more, enter an 8x unrolled loop
		// of interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations to maximize
		// pipeline throughput.
		" cbz %[loops], 4f \n\t"

		// The SIMD instructions do not touch the flags.
		"88: subs %[loops], %[loops], #1 \n\t"
		" " ROUND_A_FIRST()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B_LAST()
		" b.ne 88b \n\t"

		// Enter a 4x unrolled loop for rounds of 4 or more.
		"4: cmp %[rounds], #4 \n\t"
		" b.lt 30f \n\t"
		" " ROUND_A_FIRST()
		" " ROUND_B()
		" " ROUND_A()
		" " ROUND_B_LAST()
		" sub %[rounds], %[rounds], #4 \n\t"

		// Dispatch the remaining rounds 0..3.
		"30: cbz %[rounds], 0f \n\t"
		" cmp %[rounds], #2 \n\t"
		" b.eq 2f \n\t"
		" b.lt 1f \n\t"

		// Block of non-interlaced encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		// NOTE: [rounds] must be listed here as a read-write ("+r")
		// operand, not as an input: the `sub` in the 4x section above
		// writes to it, and GCC's extended-asm contract forbids
		// modifying input-only operands (doing so is undefined
		// behavior, even if the C code never reads the value again).
		: [loops]  "+r" (loops),
		  [rounds] "+r" (rounds),
		  [src] "+r" (*s),
		  [dst] "+r" (*o),
		  [t0] "=&w" (tmp0),
		  [t1] "=&w" (tmp1),
		  [t2] "=&w" (tmp2),
		  [t3] "=&w" (tmp3)

		// Inputs (not modified).
		: [tbl] "r" (base64_table_enc_6bit),
		  [n63] "w" (vdupq_n_u8(63))

		// Clobbers.
		: "v2", "v3", "v4", "v5",
		  "v8", "v9", "v10", "v11",
		  "v12", "v13", "v14", "v15",
		  "cc", "memory"
	);
}
#pragma GCC diagnostic pop

View file

@ -0,0 +1,31 @@
// Spread each group of three input bytes over four output bytes, six data
// bits per byte, ready for translation to the Base64 alphabet:
//
//   in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
//   in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
//   in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
//
//   out[0] = 00 00 a7 a6 a5 a4 a3 a2
//   out[1] = 00 00 a1 a0 b7 b6 b5 b4
//   out[2] = 00 00 b3 b2 b1 b0 c7 c6
//   out[3] = 00 00 c5 c4 c3 c2 c1 c0
static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (const uint8x16x3_t in)
{
	const uint8x16_t mask6 = vdupq_n_u8(0x3F);
	uint8x16x4_t res;

	// Shift the low bits into place, then use shift-left-insert to merge
	// in the high bits carried over from the previous input byte. The
	// plain right shift already zeroes the top two bits of res.val[0];
	// the remaining lanes are masked down to six bits explicitly.
	res.val[0] = vshrq_n_u8(in.val[0], 2);
	res.val[1] = vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), mask6);
	res.val[2] = vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), mask6);
	res.val[3] = vandq_u8(in.val[2], mask6);

	return res;
}

View file

@ -0,0 +1,66 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_SSE41
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("sse4.1"))), apply_to=function)
#else
#pragma GCC target("sse4.1")
#endif
#include <smmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE41_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSE41_USE_ASM 1
# else
# define BASE64_SSE41_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#if BASE64_SSE41_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_SSE41
// Streaming Base64 encoder entry point, compiled with SSE4.1 code
// generation. The block loop enc_loop_ssse3() is provided by either the
// inline-assembly or the intrinsics implementation included above,
// depending on BASE64_SSE41_USE_ASM; both are built here with SSE4.1 flags.
void
base64_stream_encode_sse41 BASE64_ENC_PARAMS
{
#if HAVE_SSE41
	// Generic prologue/epilogue are textually included around the loop:
	#include "../generic/enc_head.c"
	enc_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	// Compiled without SSE4.1 support: delegate to the stub.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming Base64 decoder entry point, compiled with SSE4.1 code
// generation. NOTE(review): ../generic/dec_tail.c is expected to supply the
// return statement for the HAVE_SSE41 branch — confirm against that file.
int
base64_stream_decode_sse41 BASE64_DEC_PARAMS
{
#if HAVE_SSE41
	#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
// Close the per-function target-attribute region opened by the matching
// `clang attribute push` at the top of this file.
#if defined(__clang__)
	#pragma clang attribute pop
#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,66 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_SSE42
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("sse4.2"))), apply_to=function)
#else
#pragma GCC target("sse4.2")
#endif
#include <nmmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE42_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSE42_USE_ASM 1
# else
# define BASE64_SSE42_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#if BASE64_SSE42_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_SSE42
// Streaming Base64 encoder entry point, compiled with SSE4.2 code
// generation. The block loop enc_loop_ssse3() is provided by either the
// inline-assembly or the intrinsics implementation included above,
// depending on BASE64_SSE42_USE_ASM; both are built here with SSE4.2 flags.
void
base64_stream_encode_sse42 BASE64_ENC_PARAMS
{
#if HAVE_SSE42
	// Generic prologue/epilogue are textually included around the loop:
	#include "../generic/enc_head.c"
	enc_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	// Compiled without SSE4.2 support: delegate to the stub.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming Base64 decoder entry point, compiled with SSE4.2 code
// generation. NOTE(review): ../generic/dec_tail.c is expected to supply the
// return statement for the HAVE_SSE42 branch — confirm against that file.
int
base64_stream_decode_sse42 BASE64_DEC_PARAMS
{
#if HAVE_SSE42
	#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
// Close the per-function target-attribute region opened by the matching
// `clang attribute push` at the top of this file.
#if defined(__clang__)
	#pragma clang attribute pop
#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,68 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_SSSE3
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("ssse3"))), apply_to=function)
#else
#pragma GCC target("ssse3")
#endif
#include <tmmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
// registers, which is not enough to run the inline assembly.
#ifndef BASE64_SSSE3_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSSE3_USE_ASM 1
# else
# define BASE64_SSSE3_USE_ASM 0
# endif
#endif
#include "dec_reshuffle.c"
#include "dec_loop.c"
#if BASE64_SSSE3_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_reshuffle.c"
# include "enc_translate.c"
# include "enc_loop.c"
#endif
#endif // HAVE_SSSE3
// Streaming Base64 encoder entry point, compiled with SSSE3 code
// generation. The block loop enc_loop_ssse3() is provided by either the
// inline-assembly or the intrinsics implementation included above,
// depending on BASE64_SSSE3_USE_ASM.
void
base64_stream_encode_ssse3 BASE64_ENC_PARAMS
{
#if HAVE_SSSE3
	// Generic prologue/epilogue are textually included around the loop:
	#include "../generic/enc_head.c"
	enc_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	// Compiled without SSSE3 support: delegate to the stub.
	base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming Base64 decoder entry point, compiled with SSSE3 code
// generation. NOTE(review): ../generic/dec_tail.c is expected to supply the
// return statement for the HAVE_SSSE3 branch — confirm against that file.
int
base64_stream_decode_ssse3 BASE64_DEC_PARAMS
{
#if HAVE_SSSE3
	#include "../generic/dec_head.c"
	dec_loop_ssse3(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
// Close the per-function target-attribute region opened by the matching
// `clang attribute push` at the top of this file.
#if defined(__clang__)
	#pragma clang attribute pop
#endif
#else
	return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View file

@ -0,0 +1,173 @@
// The input consists of six character sets in the Base64 alphabet, which we
// need to map back to the 6-bit values they represent. There are three ranges,
// two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [43] [62] +19 +
// 2 [47] [63] +16 /
// 3 [48..57] [52..61] +4 0..9
// 4 [65..90] [0..25] -65 A..Z
// 5 [97..122] [26..51] -71 a..z
// (6) Everything else => invalid input
//
// We will use lookup tables for character validation and offset computation.
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
// allows to mask with 0x2F instead of 0x0F and thus save one constant
// declaration (register and/or memory access).
//
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
// 0000 = garbage
// 0001 = /
// 0010 = +
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = a-z
// 0111 = a-z
// 1000 >= garbage
//
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
//
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
//
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
//
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100 0x04 char @ A B C D E F G H I J K L M N O
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 0110 0x04 char ` a b c d e f g h i j k l m n o
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 0111 0x08 char p q r s t u v w x y z { | } ~
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// Try to decode one 16-byte block of Base64 characters into 12 output
// bytes, using the LUT scheme documented above. Returns 1 on success
// (pointers advanced, *rounds decremented) or 0 when the block contains a
// character outside the alphabet, in which case nothing is consumed and the
// caller falls back to the bytewise decoder for precise error reporting.
static BASE64_FORCE_INLINE int
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Validation LUTs (indexed by lo/hi nibble) and offset LUT, as
	// derived in the tables at the top of this file:
	const __m128i lut_lo = _mm_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
	const __m128i lut_hi = _mm_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
	const __m128i lut_roll = _mm_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0);

	const __m128i mask_2F = _mm_set1_epi8(0x2F);

	// Load input:
	__m128i str = _mm_loadu_si128((__m128i *) *s);

	// Table lookups:
	const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
	const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
	const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
	const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);

	// Check for invalid input: if any "and" values from lo and hi are not
	// zero, fall back on bytewise code to do error checking and reporting:
	if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
		return 0;
	}

	// eq_2F is -1 for '/' bytes and 0 elsewhere; adding it to the hi
	// nibble selects the dedicated roll entry (index 1) for '/':
	const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
	const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm_add_epi8(str, roll);

	// Reshuffle the input to packed 12-byte output format:
	str = dec_reshuffle(str);

	// Store the output:
	_mm_storeu_si128((__m128i *) *o, str);

	*s += 16;
	*o += 12;
	*rounds -= 1;

	return 1;
}
// Vectorized decoding driver: peel off 16-byte input blocks (12 output
// bytes each) for as long as the input remains valid, leaving the
// remainder -- including any block with invalid characters -- to the
// bytewise fallback.
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 24) {
		return;
	}

	// Process blocks of 16 bytes per round. Because 4 extra zero bytes
	// are written after the output, ensure that there will be at least 8
	// bytes of input data left to cover the gap. (6 data bytes and up to
	// two end-of-string markers.)
	size_t rounds = (*slen - 8) / 16;

	*slen -= rounds * 16;	// 16 bytes consumed per round
	*olen += rounds * 12;	// 12 bytes produced per round

	// Manually unrolled dispatch in batches of 8, 4, 2 and 1. Each inner
	// call returns 0 on invalid input, which aborts the whole loop; the
	// rounds that were skipped are credited back below.
	for (;;) {
		if (rounds >= 8) {
			if (!(dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds))) {
				break;
			}
		} else if (rounds >= 4) {
			if (!(dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds))) {
				break;
			}
		} else if (rounds >= 2) {
			if (!(dec_loop_ssse3_inner(s, o, &rounds) &&
			      dec_loop_ssse3_inner(s, o, &rounds))) {
				break;
			}
		} else {
			if (rounds == 1) {
				dec_loop_ssse3_inner(s, o, &rounds);
			}
			break;
		}
		if (rounds == 0) {
			break;
		}
	}

	// Adjust for any rounds that were skipped:
	*slen += rounds * 16;
	*olen -= rounds * 12;
}

View file

@ -0,0 +1,33 @@
// Pack sixteen decoded 6-bit values (one per byte, four per 32-bit lane)
// into twelve contiguous output bytes; the top four bytes of the result are
// zero, which is why the caller reserves slack in the output buffer.
static BASE64_FORCE_INLINE __m128i
dec_reshuffle (const __m128i in)
{
	// in, bits, upper case are most significant bits, lower case are least significant bits
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
	const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	// 0000eeee FFffffff 0000DDDD DDddEEEE
	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB

	const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
	// 00000000 JJJJJJjj KKKKkkkk LLllllll
	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	// 00000000 DDDDDDdd EEEEeeee FFffffff
	// 00000000 AAAAAAaa BBBBbbbb CCcccccc

	// Pack bytes together (the -1 indices produce the zero bytes):
	return _mm_shuffle_epi8(out, _mm_setr_epi8(
		2, 1, 0,
		6, 5, 4,
		10, 9, 8,
		14, 13, 12,
		-1, -1, -1, -1));
	// 00000000 00000000 00000000 00000000
	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
}

View file

@ -0,0 +1,67 @@
// Encode a single 12-byte block into 16 Base64 characters. The load reads a
// full 16 bytes; the caller reserves slack at the end of the input buffer so
// this never reads out of bounds.
static BASE64_FORCE_INLINE void
enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
{
	// Load, spread the bits into 6-bit groups, translate the groups to
	// the Base64 alphabet, and store -- as one expression pipeline:
	const __m128i block = _mm_loadu_si128((const __m128i *) *s);
	_mm_storeu_si128((__m128i *) *o, enc_translate(enc_reshuffle(block)));

	*s += 12;
	*o += 16;
}
// Bulk SSSE3 encoding driver: converts whole 12-byte input blocks into
// 16-byte Base64 blocks, advancing the pointers and adjusting the
// remaining-length counters in place. A partial trailing block is left for
// the generic tail code.
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 16) {
		return;
	}

	// Process blocks of 12 bytes at a time. Because blocks are loaded 16
	// bytes at a time, ensure that there will be at least 4 remaining
	// bytes after the last round, so that the final read will not pass
	// beyond the bounds of the input buffer:
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;	// 12 bytes consumed per round
	*olen += rounds * 16;	// 16 bytes produced per round

	// Manually unrolled dispatch: take the largest batch (8, 4, 2 or 1
	// rounds) that still fits, then go around again for the remainder.
	for (;;) {
		if (rounds >= 8) {
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			rounds -= 8;
		} else if (rounds >= 4) {
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			rounds -= 4;
		} else if (rounds >= 2) {
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			rounds -= 2;
		} else {
			if (rounds == 1) {
				enc_loop_ssse3_inner(s, o);
			}
			break;
		}
		if (rounds == 0) {
			break;
		}
	}
}

View file

@ -0,0 +1,268 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round.
#define LOAD(R0, ROUND) \
	"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. Outputs in R0 and R1.
// (Same mask/multiply reshuffle as the intrinsics in enc_reshuffle.c; the
// constants are passed in as the msk0..msk3 operands.)
#define SHUF(R0, R1) \
	"pshufb %[lut0], %["R0"] \n\t" \
	"movdqa %["R0"], %["R1"] \n\t" \
	"pand %[msk0], %["R0"] \n\t" \
	"pand %[msk2], %["R1"] \n\t" \
	"pmulhuw %[msk1], %["R0"] \n\t" \
	"pmullw %[msk3], %["R1"] \n\t" \
	"por %["R1"], %["R0"] \n\t"

// Generate a block of inline assembly that takes R0 and R1 and translates
// their contents to the base64 alphabet, using preloaded constants.
// (Same offset-selection scheme as the intrinsics in enc_translate.c.)
#define TRAN(R0, R1, R2) \
	"movdqa %["R0"], %["R1"] \n\t" \
	"movdqa %["R0"], %["R2"] \n\t" \
	"psubusb %[n51], %["R1"] \n\t" \
	"pcmpgtb %[n25], %["R2"] \n\t" \
	"psubb %["R2"], %["R1"] \n\t" \
	"movdqa %[lut1], %["R2"] \n\t" \
	"pshufb %["R1"], %["R2"] \n\t" \
	"paddb %["R2"], %["R0"] \n\t"

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
	"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
	LOAD("a", 0) \
	SHUF("a", "b") \
	TRAN("a", "b", "c") \
	STOR("a", 0) \
	"add $12, %[src] \n\t" \
	"add $16, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() /* a b c d e f */ \
	LOAD("a", 0) /* + */ \
	SHUF("a", "d") /* | + */ \
	LOAD("b", 1) /* | + | */ \
	TRAN("a", "d", "e") /* | | - x */ \
	LOAD("c", 2) /* V V V */

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	LOAD(D, (ROUND + 3)) /* V V V + */ \
	SHUF(B, E) /* | | | | + */ \
	STOR(A, (ROUND + 0)) /* - | | | | */ \
	TRAN(B, E, F) /* | | | - x */ \
	LOAD(E, (ROUND + 4)) /* | | | + */ \
	SHUF(C, A) /* + | | | | */ \
	STOR(B, (ROUND + 1)) /* | - | | | */ \
	TRAN(C, A, F) /* - | | | x */ \
	LOAD(F, (ROUND + 5)) /* | | | + */ \
	SHUF(D, A) /* + | | | | */ \
	STOR(C, (ROUND + 2)) /* | - | | | */ \
	TRAN(D, A, B) /* - x V V V */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
	SHUF(E, A) /* + V V V */ \
	STOR(D, (ROUND + 3)) /* | - | | */ \
	TRAN(E, A, B) /* - x | | */ \
	SHUF(F, C) /* + | | */ \
	STOR(E, (ROUND + 4)) /* | - | */ \
	TRAN(F, C, D) /* - x | */ \
	STOR(F, (ROUND + 5)) /* - */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
	ROUND_3_A(ROUND) \
	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
	ROUND_3_B(ROUND) \
	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
// Bulk SSSE3 encoder in hand-scheduled inline assembly. Contract is the
// same as the intrinsics version of enc_loop_ssse3: consume whole 12-byte
// blocks from *s, write 16-byte Base64 blocks to *o, and update the
// pointers and remaining-length counters in place.
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// For a clearer explanation of the algorithm used by this function,
	// please refer to the plain (not inline assembly) implementation. This
	// function follows the same basic logic.
	if (*slen < 16) {
		return;
	}

	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
	// bytes, so "reserve" four bytes from the input buffer to ensure that
	// we never read beyond the end of the input buffer.
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;	// 12 bytes consumed per round
	*olen += rounds * 16;	// 16 bytes produced per round

	// Number of times to go through the 36x loop.
	size_t loops = rounds / 36;

	// Number of rounds remaining after the 36x loop.
	rounds %= 36;

	// Lookup tables: lut0 is the byte shuffle and lut1 the alphabet
	// offset table (same constants as in enc_reshuffle.c/enc_translate.c).
	const __m128i lut0 = _mm_set_epi8(
		10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
	const __m128i lut1 = _mm_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Temporary registers.
	__m128i a, b, c, d, e, f;

	// Local numeric labels name the unrolled sections by their round
	// count (36/18/9/6); 55 and 45 are dispatch labels.
	__asm__ volatile (

		// If there are 36 rounds or more, enter a 36x unrolled loop of
		// interleaved encoding rounds. The rounds interleave memory
		// operations (load/store) with data operations (table lookups,
		// etc) to maximize pipeline throughput.
		" test %[loops], %[loops] \n\t"
		" jz 18f \n\t"
		" jmp 36f \n\t"
		" \n\t"
		".balign 64 \n\t"
		"36: " ROUND_3_INIT()
		" " ROUND_3_A( 0)
		" " ROUND_3_B( 3)
		" " ROUND_3_A( 6)
		" " ROUND_3_B( 9)
		" " ROUND_3_A(12)
		" " ROUND_3_B(15)
		" " ROUND_3_A(18)
		" " ROUND_3_B(21)
		" " ROUND_3_A(24)
		" " ROUND_3_B(27)
		" " ROUND_3_A_LAST(30)
		" add $(12 * 36), %[src] \n\t"
		" add $(16 * 36), %[dst] \n\t"
		" dec %[loops] \n\t"
		" jnz 36b \n\t"

		// Enter an 18x unrolled loop for rounds of 18 or more.
		"18: cmp $18, %[rounds] \n\t"
		" jl 9f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B(3)
		" " ROUND_3_A(6)
		" " ROUND_3_B(9)
		" " ROUND_3_A_LAST(12)
		" sub $18, %[rounds] \n\t"
		" add $(12 * 18), %[src] \n\t"
		" add $(16 * 18), %[dst] \n\t"

		// Enter a 9x unrolled loop for rounds of 9 or more.
		"9: cmp $9, %[rounds] \n\t"
		" jl 6f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A(0)
		" " ROUND_3_B_LAST(3)
		" sub $9, %[rounds] \n\t"
		" add $(12 * 9), %[src] \n\t"
		" add $(16 * 9), %[dst] \n\t"

		// Enter a 6x unrolled loop for rounds of 6 or more.
		"6: cmp $6, %[rounds] \n\t"
		" jl 55f \n\t"
		" " ROUND_3_INIT()
		" " ROUND_3_A_LAST(0)
		" sub $6, %[rounds] \n\t"
		" add $(12 * 6), %[src] \n\t"
		" add $(16 * 6), %[dst] \n\t"

		// Dispatch the remaining rounds 0..5.
		"55: cmp $3, %[rounds] \n\t"
		" jg 45f \n\t"
		" je 3f \n\t"
		" cmp $1, %[rounds] \n\t"
		" jg 2f \n\t"
		" je 1f \n\t"
		" jmp 0f \n\t"
		"45: cmp $4, %[rounds] \n\t"
		" je 4f \n\t"

		// Block of non-interlaced encoding rounds, which can each
		// individually be jumped to. Rounds fall through to the next.
		"5: " ROUND()
		"4: " ROUND()
		"3: " ROUND()
		"2: " ROUND()
		"1: " ROUND()
		"0: \n\t"

		// Outputs (modified).
		: [rounds] "+r" (rounds),
		  [loops] "+r" (loops),
		  [src] "+r" (*s),
		  [dst] "+r" (*o),
		  [a] "=&x" (a),
		  [b] "=&x" (b),
		  [c] "=&x" (c),
		  [d] "=&x" (d),
		  [e] "=&x" (e),
		  [f] "=&x" (f)

		// Inputs (not modified).
		: [lut0] "x" (lut0),
		  [lut1] "x" (lut1),
		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
		  [n51] "x" (_mm_set1_epi8(51)),
		  [n25] "x" (_mm_set1_epi8(25))

		// Clobbers.
		: "cc", "memory"
	);
}
#pragma GCC diagnostic pop

View file

@ -0,0 +1,48 @@
// Spread each group of three input bytes over four output bytes, six data
// bits per byte (values 0..63), ready for translation to the Base64
// alphabet by enc_translate(). Only the low 12 input bytes are used.
static BASE64_FORCE_INLINE __m128i
enc_reshuffle (__m128i in)
{
	// Input, bytes MSB to LSB:
	// 0 0 0 0 l k j i h g f e d c b a

	in = _mm_shuffle_epi8(in, _mm_set_epi8(
		10, 11, 9, 10,
		7, 8, 6, 7,
		4, 5, 3, 4,
		1, 2, 0, 1));
	// in, bytes MSB to LSB:
	// k l j k
	// h i g h
	// e f d e
	// b c a b

	const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00));
	// bits, upper case are most significant bits, lower case are least significant bits
	// 0000kkkk LL000000 JJJJJJ00 00000000
	// 0000hhhh II000000 GGGGGG00 00000000
	// 0000eeee FF000000 DDDDDD00 00000000
	// 0000bbbb CC000000 AAAAAA00 00000000

	const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
	// 00000000 00kkkkLL 00000000 00JJJJJJ
	// 00000000 00hhhhII 00000000 00GGGGGG
	// 00000000 00eeeeFF 00000000 00DDDDDD
	// 00000000 00bbbbCC 00000000 00AAAAAA

	const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0));
	// 00000000 00llllll 000000jj KKKK0000
	// 00000000 00iiiiii 000000gg HHHH0000
	// 00000000 00ffffff 000000dd EEEE0000
	// 00000000 00cccccc 000000aa BBBB0000

	const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
	// 00llllll 00000000 00jjKKKK 00000000
	// 00iiiiii 00000000 00ddEEEE 00000000 -- see diagrams below
	// 00ffffff 00000000 00ddEEEE 00000000
	// 00cccccc 00000000 00aaBBBB 00000000

	// Merge the two disjoint halves:
	return _mm_or_si128(t1, t3);
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
}

View file

@ -0,0 +1,33 @@
// Translate sixteen 6-bit values (0..63, one per byte) into their Base64
// ASCII characters by adding a per-range offset selected with a pshufb
// lookup. Expects input produced by enc_reshuffle().
static BASE64_FORCE_INLINE __m128i
enc_translate (const __m128i in)
{
	// A lookup table containing the absolute offsets for all ranges:
	const __m128i lut = _mm_setr_epi8(
		65, 71, -4, -4,
		-4, -4, -4, -4,
		-4, -4, -4, -4,
		-19, -16, 0, 0
	);
	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To        Abs    Index  Characters
	// 0  [0..25]   [65..90]  +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122] +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]   -4  [2..11]  0123456789
	// 3  [62]      [43]      -19       12  +
	// 4  [63]      [47]      -16       13  /
	// Create LUT indices from the input. The index for range #0 is right,
	// others are 1 less than expected:
	__m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51));
	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	__m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));
	// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
	// now correct:
	indices = _mm_sub_epi8(indices, mask);
	// Add offsets to input values:
	return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices));
}

View file

@ -0,0 +1,314 @@
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "libbase64.h"
#include "codecs.h"
#include "config.h"
#include "env.h"
#if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
#define BASE64_X86
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
#define BASE64_X86_SIMD
#endif
#endif
#ifdef BASE64_X86
#ifdef _MSC_VER
#include <intrin.h>
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
{ \
int info[4]; \
__cpuidex(info, __level, __count); \
__eax = info[0]; \
__ebx = info[1]; \
__ecx = info[2]; \
__edx = info[3]; \
}
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
#else
#include <cpuid.h>
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
static inline uint64_t _xgetbv (uint32_t index)
{
uint32_t eax, edx;
__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
return ((uint64_t)edx << 32) | eax;
}
#else
#error "Platform not supported"
#endif
#endif
#endif
#ifndef bit_AVX512vl
#define bit_AVX512vl (1 << 31)
#endif
#ifndef bit_AVX512vbmi
#define bit_AVX512vbmi (1 << 1)
#endif
#ifndef bit_AVX2
#define bit_AVX2 (1 << 5)
#endif
#ifndef bit_SSSE3
#define bit_SSSE3 (1 << 9)
#endif
#ifndef bit_SSE41
#define bit_SSE41 (1 << 19)
#endif
#ifndef bit_SSE42
#define bit_SSE42 (1 << 20)
#endif
#ifndef bit_AVX
#define bit_AVX (1 << 28)
#endif
#define bit_XSAVE_XRSTORE (1 << 27)
#ifndef _XCR_XFEATURE_ENABLED_MASK
#define _XCR_XFEATURE_ENABLED_MASK 0
#endif
#define bit_XMM (1 << 1)
#define bit_YMM (1 << 2)
#define bit_OPMASK (1 << 5)
#define bit_ZMM (1 << 6)
#define bit_HIGH_ZMM (1 << 7)
#define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS (bit_XMM | bit_YMM)
#define _AVX_512_ENABLED_BY_OS (bit_XMM | bit_YMM | bit_OPMASK | bit_ZMM | bit_HIGH_ZMM)
#endif
// Function declarations:
#define BASE64_CODEC_FUNCS(arch) \
extern void base64_stream_encode_ ## arch BASE64_ENC_PARAMS; \
extern int base64_stream_decode_ ## arch BASE64_DEC_PARAMS;
BASE64_CODEC_FUNCS(avx512)
BASE64_CODEC_FUNCS(avx2)
BASE64_CODEC_FUNCS(neon32)
BASE64_CODEC_FUNCS(neon64)
BASE64_CODEC_FUNCS(plain)
BASE64_CODEC_FUNCS(ssse3)
BASE64_CODEC_FUNCS(sse41)
BASE64_CODEC_FUNCS(sse42)
BASE64_CODEC_FUNCS(avx)
static bool
codec_choose_forced (struct codec *codec, int flags)
{
// If the user wants to use a certain codec,
// always allow it, even if the codec is a no-op.
// For testing purposes.
if (!(flags & 0xFFFF)) {
return false;
}
if (flags & BASE64_FORCE_AVX2) {
codec->enc = base64_stream_encode_avx2;
codec->dec = base64_stream_decode_avx2;
return true;
}
if (flags & BASE64_FORCE_NEON32) {
codec->enc = base64_stream_encode_neon32;
codec->dec = base64_stream_decode_neon32;
return true;
}
if (flags & BASE64_FORCE_NEON64) {
codec->enc = base64_stream_encode_neon64;
codec->dec = base64_stream_decode_neon64;
return true;
}
if (flags & BASE64_FORCE_PLAIN) {
codec->enc = base64_stream_encode_plain;
codec->dec = base64_stream_decode_plain;
return true;
}
if (flags & BASE64_FORCE_SSSE3) {
codec->enc = base64_stream_encode_ssse3;
codec->dec = base64_stream_decode_ssse3;
return true;
}
if (flags & BASE64_FORCE_SSE41) {
codec->enc = base64_stream_encode_sse41;
codec->dec = base64_stream_decode_sse41;
return true;
}
if (flags & BASE64_FORCE_SSE42) {
codec->enc = base64_stream_encode_sse42;
codec->dec = base64_stream_decode_sse42;
return true;
}
if (flags & BASE64_FORCE_AVX) {
codec->enc = base64_stream_encode_avx;
codec->dec = base64_stream_decode_avx;
return true;
}
if (flags & BASE64_FORCE_AVX512) {
codec->enc = base64_stream_encode_avx512;
codec->dec = base64_stream_decode_avx512;
return true;
}
return false;
}
// Select a NEON codec when one was compiled in. Returns true when `codec`
// was filled in, false when the caller should try another selector.
static bool
codec_choose_arm (struct codec *codec)
{
#if HAVE_NEON64 || ((defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32)
	// Unfortunately there is no portable way to check for NEON
	// support at runtime from userland in the same way that x86
	// has cpuid, so just stick to the compile-time configuration:
#if HAVE_NEON64
	codec->enc = base64_stream_encode_neon64;
	codec->dec = base64_stream_decode_neon64;
#else
	codec->enc = base64_stream_encode_neon32;
	codec->dec = base64_stream_decode_neon32;
#endif
	return true;
#else
	(void)codec;
	return false;
#endif
}
// Runtime x86 feature detection via CPUID (plus XGETBV for the AVX family).
// Fills in `codec` with the best implementation both the CPU and the OS
// support, probing in order AVX512 > AVX2 > AVX > SSE4.2 > SSE4.1 > SSSE3.
// Returns false when no SIMD codec was selected or compiled in.
static bool
codec_choose_x86 (struct codec *codec)
{
#ifdef BASE64_X86_SIMD
	unsigned int eax, ebx = 0, ecx = 0, edx;
	unsigned int max_level;
#ifdef _MSC_VER
	int info[4];
	__cpuidex(info, 0, 0);
	max_level = info[0];
#else
	max_level = __get_cpuid_max(0, NULL);
#endif
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
	// Check for AVX/AVX2/AVX512 support:
	// Checking for AVX requires 3 things:
	// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
	//    (allowing saving YMM registers on context switch)
	// 2) CPUID indicates support for AVX
	// 3) XGETBV indicates the AVX registers will be saved and restored on
	//    context switch
	//
	// Note that XGETBV is only available on 686 or later CPUs, so the
	// instruction needs to be conditionally run.
	if (max_level >= 1) {
		__cpuid_count(1, 0, eax, ebx, ecx, edx);
		if (ecx & bit_XSAVE_XRSTORE) {
			uint64_t xcr_mask;
			xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
			if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once
#if HAVE_AVX512
				// AVX-512 additionally needs the opmask and ZMM
				// state saved by the OS, and both the VL and VBMI
				// CPU extensions:
				if (max_level >= 7 && ((xcr_mask & _AVX_512_ENABLED_BY_OS) == _AVX_512_ENABLED_BY_OS)) {
					__cpuid_count(7, 0, eax, ebx, ecx, edx);
					if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
						codec->enc = base64_stream_encode_avx512;
						codec->dec = base64_stream_decode_avx512;
						return true;
					}
				}
#endif
#if HAVE_AVX2
				if (max_level >= 7) {
					__cpuid_count(7, 0, eax, ebx, ecx, edx);
					if (ebx & bit_AVX2) {
						codec->enc = base64_stream_encode_avx2;
						codec->dec = base64_stream_decode_avx2;
						return true;
					}
				}
#endif
#if HAVE_AVX
				__cpuid_count(1, 0, eax, ebx, ecx, edx);
				if (ecx & bit_AVX) {
					codec->enc = base64_stream_encode_avx;
					codec->dec = base64_stream_decode_avx;
					return true;
				}
#endif
			}
		}
	}
#endif
#if HAVE_SSE42
	// Check for SSE42 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSE42) {
			codec->enc = base64_stream_encode_sse42;
			codec->dec = base64_stream_decode_sse42;
			return true;
		}
	}
#endif
#if HAVE_SSE41
	// Check for SSE41 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSE41) {
			codec->enc = base64_stream_encode_sse41;
			codec->dec = base64_stream_decode_sse41;
			return true;
		}
	}
#endif
#if HAVE_SSSE3
	// Check for SSSE3 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSSE3) {
			codec->enc = base64_stream_encode_ssse3;
			codec->dec = base64_stream_decode_ssse3;
			return true;
		}
	}
#endif
#else
	(void)codec;
#endif
	return false;
}
// Pick the codec implementation to use: an explicit user override wins,
// then runtime feature detection (ARM, then x86), and finally the portable
// plain codec as the universal fallback.
void
codec_choose (struct codec *codec, int flags)
{
	if (codec_choose_forced(codec, flags) ||
	    codec_choose_arm(codec)           ||
	    codec_choose_x86(codec)) {
		return;
	}
	codec->enc = base64_stream_encode_plain;
	codec->dec = base64_stream_decode_plain;
}

View file

@ -0,0 +1,57 @@
#include "libbase64.h"
// Function parameters for encoding functions:
#define BASE64_ENC_PARAMS \
( struct base64_state *state \
, const char *src \
, size_t srclen \
, char *out \
, size_t *outlen \
)
// Function parameters for decoding functions:
#define BASE64_DEC_PARAMS \
( struct base64_state *state \
, const char *src \
, size_t srclen \
, char *out \
, size_t *outlen \
)
// This function is used as a stub when a certain encoder is not compiled in.
// It discards the inputs and returns zero output bytes.
static inline void
base64_enc_stub BASE64_ENC_PARAMS
{
	(void) state;
	(void) src;
	(void) srclen;
	(void) out;
	// Report that nothing was written:
	*outlen = 0;
}
// This function is used as a stub when a certain decoder is not compiled in.
// It discards the inputs and returns an invalid decoding result.
static inline int
base64_dec_stub BASE64_DEC_PARAMS
{
	(void) state;
	(void) src;
	(void) srclen;
	(void) out;
	(void) outlen;
	// -1 signals "codec not available in this build" to callers:
	return -1;
}
typedef void (* base64_enc_fn) BASE64_ENC_PARAMS;
typedef int (* base64_dec_fn) BASE64_DEC_PARAMS;
struct codec
{
base64_enc_fn enc;
base64_dec_fn dec;
};
extern void codec_choose (struct codec *, int flags);

View file

@ -0,0 +1,22 @@
#ifndef BASE64_CONFIG_H
#define BASE64_CONFIG_H
#if !defined(__APPLE__) && ((defined(__x86_64__) && defined(__LP64__)) || defined(_M_X64))
#define HAVE_SSSE3 1
#define HAVE_SSE41 1
#define HAVE_SSE42 1
#define HAVE_AVX 1
#define HAVE_AVX2 1
#define HAVE_AVX512 0
#elif (defined(__APPLE__) && defined(__aarch64__))
#define HAVE_NEON64 1
#elif (defined(__wasm__) && defined(__wasm_simd128__))
#include "emscripten/version.h"
#if __EMSCRIPTEN_major__ == 3
#define HAVE_NEON32 1
#elif __EMSCRIPTEN_major__ > 3
#define HAVE_NEON64 1
#endif
#endif
#endif // BASE64_CONFIG_H

View file

@ -0,0 +1,84 @@
#ifndef BASE64_ENV_H
#define BASE64_ENV_H
#include <stdint.h>
// This header file contains macro definitions that describe certain aspects of
// the compile-time environment. Compatibility and portability macros go here.
// Define machine endianness. This is for GCC:
#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
# define BASE64_LITTLE_ENDIAN 1
#else
# define BASE64_LITTLE_ENDIAN 0
#endif
// This is for Clang:
#ifdef __LITTLE_ENDIAN__
# define BASE64_LITTLE_ENDIAN 1
#endif
#ifdef __BIG_ENDIAN__
# define BASE64_LITTLE_ENDIAN 0
#endif
// MSVC++ needs intrin.h for _byteswap_uint64 (issue #68):
#if BASE64_LITTLE_ENDIAN && defined(_MSC_VER)
# include <intrin.h>
#endif
// Endian conversion functions:
#if BASE64_LITTLE_ENDIAN
# ifdef _MSC_VER
// Microsoft Visual C++:
# define BASE64_HTOBE32(x) _byteswap_ulong(x)
# define BASE64_HTOBE64(x) _byteswap_uint64(x)
# else
// GCC and Clang:
# define BASE64_HTOBE32(x) __builtin_bswap32(x)
# define BASE64_HTOBE64(x) __builtin_bswap64(x)
# endif
#else
// No conversion needed:
# define BASE64_HTOBE32(x) (x)
# define BASE64_HTOBE64(x) (x)
#endif
// Detect word size:
#if defined (__x86_64__)
// This also works for the x32 ABI, which has a 64-bit word size.
# define BASE64_WORDSIZE 64
#elif SIZE_MAX == UINT32_MAX
# define BASE64_WORDSIZE 32
#elif SIZE_MAX == UINT64_MAX
# define BASE64_WORDSIZE 64
#else
# error BASE64_WORDSIZE_NOT_DEFINED
#endif
// End-of-file definitions.
// Almost end-of-file when waiting for the last '=' character:
#define BASE64_AEOF 1
// End-of-file when stream end has been reached or invalid input provided:
#define BASE64_EOF 2
// GCC 7 defaults to issuing a warning for fallthrough in switch statements,
// unless the fallthrough cases are marked with an attribute. As we use
// fallthrough deliberately, define an alias for the attribute:
#if __GNUC__ >= 7
# define BASE64_FALLTHROUGH __attribute__((fallthrough));
#else
# define BASE64_FALLTHROUGH
#endif
// Declare macros to ensure that functions that are intended to be inlined, are
// actually inlined, even when no optimization is applied. A lot of inner loop
// code is factored into separate functions for reasons of readability, but
// that code should always be inlined (and optimized) in the main loop.
#ifdef _MSC_VER
# define BASE64_FORCE_INLINE __forceinline
#else
# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
#endif
#endif // BASE64_ENV_H

View file

@ -0,0 +1,164 @@
#include <stdint.h>
#include <stddef.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "libbase64.h"
#include "tables/tables.h"
#include "codecs.h"
#include "env.h"
// These static function pointers are initialized once when the library is
// first used, and remain in use for the remaining lifetime of the program.
// The idea being that CPU features don't change at runtime.
static struct codec codec = { NULL, NULL };
// Initialize the stream-encoder state and (re)select the codec.
// BUGFIX: the force flags occupy the low 16 bits (BASE64_FORCE_AVX512 is
// 1 << 8), so mask with 0xFFFF — matching base64_stream_decode_init() and
// codec_choose_forced(). The previous 0xFF mask silently ignored
// BASE64_FORCE_AVX512 once a codec had already been chosen.
void
base64_stream_encode_init (struct base64_state *state, int flags)
{
	// If any of the codec flags are set, redo choice:
	if (codec.enc == NULL || flags & 0xFFFF) {
		codec_choose(&codec, flags);
	}
	state->eof = 0;
	state->bytes = 0;
	state->carry = 0;
	state->flags = flags;
}
// Encode a chunk of input through the globally selected codec. The codec
// pointer is set by base64_stream_encode_init(), which must be called first.
void
base64_stream_encode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
)
{
	codec.enc(state, src, srclen, out, outlen);
}
// Flush the encoder: emit the final partial group (if any) with '='
// padding. With one leftover byte the carry encodes to one character plus
// "==" (3 bytes out); with two leftover bytes, one character plus "="
// (2 bytes out); otherwise nothing is written.
void
base64_stream_encode_final
( struct base64_state *state
, char *out
, size_t *outlen
)
{
	uint8_t *o = (uint8_t *)out;
	switch (state->bytes) {
	case 1:
		o[0] = base64_table_enc_6bit[state->carry];
		o[1] = '=';
		o[2] = '=';
		*outlen = 3;
		break;
	case 2:
		o[0] = base64_table_enc_6bit[state->carry];
		o[1] = '=';
		*outlen = 2;
		break;
	default:
		*outlen = 0;
		break;
	}
}
// Initialize the stream-decoder state and (re)select the codec. The 0xFFFF
// mask covers all BASE64_FORCE_* flags (low 16 bits).
void
base64_stream_decode_init (struct base64_state *state, int flags)
{
	// If any of the codec flags are set, redo choice:
	if (codec.dec == NULL || flags & 0xFFFF) {
		codec_choose(&codec, flags);
	}
	state->eof = 0;
	state->bytes = 0;
	state->carry = 0;
	state->flags = flags;
}
// Decode a chunk of input through the globally selected codec. The codec
// pointer is set by base64_stream_decode_init(), which must be called first.
// Propagates the codec's return value (see libbase64.h for semantics).
int
base64_stream_decode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
)
{
	return codec.dec(state, src, srclen, out, outlen);
}
#ifdef _OPENMP
// Due to the overhead of initializing OpenMP and creating a team of
// threads, we require the data length to be larger than a threshold:
#define OMP_THRESHOLD 20000
// Conditionally include OpenMP-accelerated codec implementations:
#include "lib_openmp.c"
#endif
// One-shot encode wrapper: stream-encode the whole input, then append the
// padding trailer. Large inputs are delegated to the OpenMP variant when
// compiled with _OPENMP. Caller provides an out buffer of at least 4/3 the
// input size (see libbase64.h).
void
base64_encode
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
)
{
	size_t s;
	size_t t;
	struct base64_state state;
#ifdef _OPENMP
	if (srclen >= OMP_THRESHOLD) {
		base64_encode_openmp(src, srclen, out, outlen, flags);
		return;
	}
#endif
	// Init the stream reader:
	base64_stream_encode_init(&state, flags);
	// Feed the whole string to the stream reader:
	base64_stream_encode(&state, src, srclen, out, &s);
	// Finalize the stream by writing trailer if any:
	base64_stream_encode_final(&state, out + s, &t);
	// Final output length is stream length plus tail:
	*outlen = s + t;
}
// One-shot decode wrapper: stream-decode the whole input. Large inputs are
// delegated to the OpenMP variant when compiled with _OPENMP. Returns the
// stream decoder's result on a complete final block, else 0 (decode error
// with a partial trailing group); see libbase64.h for return semantics.
int
base64_decode
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
)
{
	int ret;
	struct base64_state state;
#ifdef _OPENMP
	if (srclen >= OMP_THRESHOLD) {
		return base64_decode_openmp(src, srclen, out, outlen, flags);
	}
#endif
	// Init the stream reader:
	base64_stream_decode_init(&state, flags);
	// Feed the whole string to the stream reader:
	ret = base64_stream_decode(&state, src, srclen, out, outlen);
	// If when decoding a whole block, we're still waiting for input then fail:
	if (ret && (state.bytes == 0)) {
		return ret;
	}
	return 0;
}

View file

@ -0,0 +1,149 @@
// This code makes some assumptions on the implementation of
// base64_stream_encode_init(), base64_stream_encode() and base64_stream_decode().
// Basically these assumptions boil down to that when breaking the src into
// parts, out parts can be written without side effects.
// This is met when:
// 1) base64_stream_encode() and base64_stream_decode() don't use globals;
// 2) the shared variables src and out are not read or written outside of the
// bounds of their parts, i.e. when base64_stream_encode() reads a multiple
// of 3 bytes, it must write no more then a multiple of 4 bytes, not even
// temporarily;
// 3) the state flag can be discarded after base64_stream_encode() and
// base64_stream_decode() on the parts.
// Parallel variant of base64_encode(): gives each OpenMP thread one chunk
// of len bytes (len a multiple of 3, so output offsets are exact: 3 bytes
// in always yield 4 bytes out), encodes the chunks independently, then
// serially encodes the remainder and writes the trailer.
static inline void
base64_encode_openmp
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
)
{
	size_t s;
	size_t t;
	size_t sum = 0, len, last_len;
	struct base64_state state, initial_state;
	int num_threads, i;
	// Request a number of threads but not necessarily get them:
	#pragma omp parallel
	{
		// Get the number of threads used from one thread only,
		// as num_threads is a shared var:
		#pragma omp single
		{
			num_threads = omp_get_num_threads();
			// Split the input string into num_threads parts, each
			// part a multiple of 3 bytes. The remaining bytes will
			// be done later:
			len = srclen / (num_threads * 3);
			len *= 3;
			last_len = srclen - num_threads * len;
			// Init the stream reader:
			base64_stream_encode_init(&state, flags);
			initial_state = state;
		}
		// Single has an implicit barrier for all threads to wait here
		// for the above to complete:
		#pragma omp for firstprivate(state) private(s) reduction(+:sum) schedule(static,1)
		for (i = 0; i < num_threads; i++)
		{
			// Feed each part of the string to the stream reader:
			base64_stream_encode(&state, src + i * len, len, out + i * len * 4 / 3, &s);
			sum += s;
		}
	}
	// As encoding should never fail and we encode an exact multiple
	// of 3 bytes, we can discard state:
	state = initial_state;
	// Encode the remaining bytes:
	base64_stream_encode(&state, src + num_threads * len, last_len, out + num_threads * len * 4 / 3, &s);
	// Finalize the stream by writing trailer if any:
	base64_stream_encode_final(&state, out + num_threads * len * 4 / 3 + s, &t);
	// Final output length is stream length plus tail:
	sum += s + t;
	*outlen = sum;
}
// Parallel variant of base64_decode(): gives each OpenMP thread one chunk
// of len bytes (len a multiple of 4, so output offsets are exact: 4 chars
// in always yield 3 bytes out), decodes the chunks independently, then
// serially decodes the remainder. Mirrors base64_decode()'s return
// convention: -1 when the codec is unavailable, 0 on decode error.
static inline int
base64_decode_openmp
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
)
{
	int num_threads, result = 0, i;
	size_t sum = 0, len, last_len, s;
	struct base64_state state, initial_state;
	// Request a number of threads but not necessarily get them:
	#pragma omp parallel
	{
		// Get the number of threads used from one thread only,
		// as num_threads is a shared var:
		#pragma omp single
		{
			num_threads = omp_get_num_threads();
			// Split the input string into num_threads parts, each
			// part a multiple of 4 bytes. The remaining bytes will
			// be done later:
			len = srclen / (num_threads * 4);
			len *= 4;
			last_len = srclen - num_threads * len;
			// Init the stream reader:
			base64_stream_decode_init(&state, flags);
			initial_state = state;
		}
		// Single has an implicit barrier to wait here for the above to
		// complete:
		#pragma omp for firstprivate(state) private(s) reduction(+:sum, result) schedule(static,1)
		for (i = 0; i < num_threads; i++)
		{
			int this_result;
			// Feed each part of the string to the stream reader:
			this_result = base64_stream_decode(&state, src + i * len, len, out + i * len * 3 / 4, &s);
			sum += s;
			result += this_result;
		}
	}
	// If `result' equals `-num_threads', then all threads returned -1,
	// indicating that the requested codec is not available:
	if (result == -num_threads) {
		return -1;
	}
	// If `result' does not equal `num_threads', then at least one of the
	// threads hit a decode error:
	if (result != num_threads) {
		return 0;
	}
	// So far so good, now decode whatever remains in the buffer. Reuse the
	// initial state, since we are at a 4-byte boundary:
	state = initial_state;
	result = base64_stream_decode(&state, src + num_threads * len, last_len, out + num_threads * len * 3 / 4, &s);
	sum += s;
	*outlen = sum;
	// If when decoding a whole block, we're still waiting for input then fail:
	if (result && (state.bytes == 0)) {
		return result;
	}
	return 0;
}

View file

@ -0,0 +1,146 @@
#ifndef LIBBASE64_H
#define LIBBASE64_H
#include <stddef.h> /* size_t */
#if defined(_WIN32) || defined(__CYGWIN__)
#define BASE64_SYMBOL_IMPORT __declspec(dllimport)
#define BASE64_SYMBOL_EXPORT __declspec(dllexport)
#define BASE64_SYMBOL_PRIVATE
#elif __GNUC__ >= 4
#define BASE64_SYMBOL_IMPORT __attribute__ ((visibility ("default")))
#define BASE64_SYMBOL_EXPORT __attribute__ ((visibility ("default")))
#define BASE64_SYMBOL_PRIVATE __attribute__ ((visibility ("hidden")))
#else
#define BASE64_SYMBOL_IMPORT
#define BASE64_SYMBOL_EXPORT
#define BASE64_SYMBOL_PRIVATE
#endif
#if defined(BASE64_STATIC_DEFINE)
#define BASE64_EXPORT
#define BASE64_NO_EXPORT
#else
#if defined(BASE64_EXPORTS) // defined if we are building the shared library
#define BASE64_EXPORT BASE64_SYMBOL_EXPORT
#else
#define BASE64_EXPORT BASE64_SYMBOL_IMPORT
#endif
#define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* These are the flags that can be passed in the `flags` argument. The values
* below force the use of a given codec, even if that codec is a no-op in the
* current build. Used in testing. Set to 0 for the default behavior, which is
* runtime feature detection on x86, a compile-time fixed codec on ARM, and
* the plain codec on other platforms: */
#define BASE64_FORCE_AVX2 (1 << 0)
#define BASE64_FORCE_NEON32 (1 << 1)
#define BASE64_FORCE_NEON64 (1 << 2)
#define BASE64_FORCE_PLAIN (1 << 3)
#define BASE64_FORCE_SSSE3 (1 << 4)
#define BASE64_FORCE_SSE41 (1 << 5)
#define BASE64_FORCE_SSE42 (1 << 6)
#define BASE64_FORCE_AVX (1 << 7)
#define BASE64_FORCE_AVX512 (1 << 8)
/* Opaque-ish streaming state shared by the encoder and decoder. Initialize
 * with base64_stream_encode_init() / base64_stream_decode_init(). */
struct base64_state {
	int eof;             /* end-of-stream marker used internally by the codecs */
	int bytes;           /* number of leftover input bytes/chars buffered between calls */
	int flags;           /* the BASE64_FORCE_* flags passed at init time */
	unsigned char carry; /* buffered partial group carried between calls */
};
/* Wrapper function to encode a plain string of given length. Output is written
* to *out without trailing zero. Output length in bytes is written to *outlen.
* The buffer in `out` has been allocated by the caller and is at least 4/3 the
* size of the input. See above for `flags`; set to 0 for default operation: */
void BASE64_EXPORT base64_encode
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
) ;
/* Call this before calling base64_stream_encode() to init the state. See above
* for `flags`; set to 0 for default operation: */
void BASE64_EXPORT base64_stream_encode_init
( struct base64_state *state
, int flags
) ;
/* Encodes the block of data of given length at `src`, into the buffer at
* `out`. Caller is responsible for allocating a large enough out-buffer; it
* must be at least 4/3 the size of the in-buffer, but take some margin. Places
* the number of new bytes written into `outlen` (which is set to zero when the
* function starts). Does not zero-terminate or finalize the output. */
void BASE64_EXPORT base64_stream_encode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
) ;
/* Finalizes the output begun by previous calls to `base64_stream_encode()`.
* Adds the required end-of-stream markers if appropriate. `outlen` is modified
* and will contain the number of new bytes written at `out` (which will quite
* often be zero). */
void BASE64_EXPORT base64_stream_encode_final
( struct base64_state *state
, char *out
, size_t *outlen
) ;
/* Wrapper function to decode a plain string of given length. Output is written
* to *out without trailing zero. Output length in bytes is written to *outlen.
* The buffer in `out` has been allocated by the caller and is at least 3/4 the
* size of the input. See above for `flags`, set to 0 for default operation: */
int BASE64_EXPORT base64_decode
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
) ;
/* Call this before calling base64_stream_decode() to init the state. See above
* for `flags`; set to 0 for default operation: */
void BASE64_EXPORT base64_stream_decode_init
( struct base64_state *state
, int flags
) ;
/* Decodes the block of data of given length at `src`, into the buffer at
* `out`. Caller is responsible for allocating a large enough out-buffer; it
* must be at least 3/4 the size of the in-buffer, but take some margin. Places
* the number of new bytes written into `outlen` (which is set to zero when the
* function starts). Does not zero-terminate the output. Returns 1 if all is
* well, and 0 if a decoding error was found, such as an invalid character.
* Returns -1 if the chosen codec is not included in the current build. Used by
* the test harness to check whether a codec is available for testing. */
int BASE64_EXPORT base64_stream_decode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
) ;
#ifdef __cplusplus
}
#endif
#endif /* LIBBASE64_H */

View file

@ -0,0 +1,387 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdbool.h>
#define BASE64_EXPORTS
#include "librt_base64.h"
#include "libbase64.h"
#include "pythoncapi_compat.h"
static PyObject *
b64decode_handle_invalid_input(
PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen, bool freesrc);
#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)
#define STACK_BUFFER_SIZE 1024
static void
convert_encoded_to_urlsafe(char *buf, size_t len) {
    // Rewrite a standard-alphabet Base64 buffer in place, substituting the
    // URL-safe characters: '+' becomes '-' and '/' becomes '_'. The
    // unconditional store per byte keeps the loop vectorizable.
    for (size_t pos = 0; pos < len; pos++) {
        char cur = buf[pos];
        switch (cur) {
        case '+': cur = '-'; break;
        case '/': cur = '_'; break;
        default:              break;
        }
        buf[pos] = cur;
    }
}
static void
convert_urlsafe_to_encoded(const char *src, size_t len, char *buf) {
    // Copy `len` bytes from src to buf, mapping the URL-safe alphabet back
    // to the standard one: '-' becomes '+' and '_' becomes '/'. The simple
    // per-byte form keeps the loop vectorizable.
    for (size_t pos = 0; pos < len; pos++) {
        char cur = src[pos];
        if (cur == '-') {
            cur = '+';
        }
        if (cur == '_') {
            cur = '/';
        }
        buf[pos] = cur;
    }
}
// Encode the bytes object `obj` with the SIMD-accelerated base64_encode().
// When `urlsafe` is true, the output uses the URL-safe alphabet ('-', '_').
// Returns a new bytes object, or NULL with an exception set.
// CLEANUP: removed unused leftover locals (ascii_data, leftbits, this_ch,
// leftchar, out_len, writer, and the TODO `newline`) that were never read.
static PyObject *
b64encode_internal(PyObject *obj, bool urlsafe) {
    if (!PyBytes_Check(obj)) {
        PyErr_SetString(PyExc_TypeError, "base64() expects a bytes object");
        return NULL;
    }
    char *bin_data = PyBytes_AS_STRING(obj);
    Py_ssize_t bin_len = PyBytes_GET_SIZE(obj);
    assert(bin_len >= 0);
    if (bin_len > BASE64_MAXBIN) {
        PyErr_SetString(PyExc_ValueError, "Too much data for base64 line");
        return NULL;
    }
    // Worst-case output size: 4 chars per 3 input bytes, plus slack for the
    // padded trailer.
    Py_ssize_t buflen = 4 * bin_len / 3 + 4;
    char *buf;
    char stack_buf[STACK_BUFFER_SIZE];
    if (buflen <= STACK_BUFFER_SIZE) {
        // Small outputs use a stack buffer to avoid heap traffic.
        buf = stack_buf;
    } else {
        buf = PyMem_Malloc(buflen);
        if (buf == NULL) {
            return PyErr_NoMemory();
        }
    }
    size_t actual_len;
    base64_encode(bin_data, bin_len, buf, &actual_len, 0);
    if (urlsafe) {
        convert_encoded_to_urlsafe(buf, actual_len);
    }
    PyObject *res = PyBytes_FromStringAndSize(buf, actual_len);
    if (buflen > STACK_BUFFER_SIZE) {
        PyMem_Free(buf);
    }
    return res;
}
// METH_FASTCALL entry point for librt.base64.b64encode(data).
static PyObject*
b64encode(PyObject *self, PyObject *const *args, size_t nargs) {
    if (nargs != 1) {
        PyErr_SetString(PyExc_TypeError, "b64encode() takes exactly one argument");
        return 0;
    }
    return b64encode_internal(args[0], false);
}
// METH_FASTCALL entry point for librt.base64.urlsafe_b64encode(data).
static PyObject*
urlsafe_b64encode(PyObject *self, PyObject *const *args, size_t nargs) {
    if (nargs != 1) {
        PyErr_SetString(PyExc_TypeError, "urlsafe_b64encode() takes exactly one argument");
        return 0;
    }
    return b64encode_internal(args[0], true);
}
// True for the standard Base64 alphabet [A-Za-z0-9+/]; '=' is accepted only
// when the caller allows padding characters.
static inline int
is_valid_base64_char(char c, bool allow_padding) {
    if (c >= 'A' && c <= 'Z') {
        return 1;
    }
    if (c >= 'a' && c <= 'z') {
        return 1;
    }
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c == '+' || c == '/') {
        return 1;
    }
    return allow_padding && c == '=';
}
// Decode a Base64-encoded bytes object or ASCII string. When `urlsafe` is
// true, '-' and '_' are first translated back to '+' and '/'. Returns a new
// bytes object, or NULL with an exception set.
// BUGFIX: the overflow-error return path leaked the urlsafe translation
// buffer; it now frees `src` like every other error path does.
static PyObject *
b64decode_internal(PyObject *arg, bool urlsafe) {
    const char *src;
    Py_ssize_t srclen_ssz;

    // Get input pointer and length
    if (PyBytes_Check(arg)) {
        src = PyBytes_AS_STRING(arg);
        srclen_ssz = PyBytes_GET_SIZE(arg);
    } else if (PyUnicode_Check(arg)) {
        if (!PyUnicode_IS_ASCII(arg)) {
            PyErr_SetString(PyExc_ValueError,
                            "string argument should contain only ASCII characters");
            return NULL;
        }
        src = (const char *)PyUnicode_1BYTE_DATA(arg);
        srclen_ssz = PyUnicode_GET_LENGTH(arg);
    } else {
        PyErr_SetString(PyExc_TypeError,
                        "argument should be a bytes-like object or ASCII string");
        return NULL;
    }

    // Fast-path: empty input
    if (srclen_ssz == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    if (urlsafe) {
        // Translate into a heap copy; from here on `src` is owned by this
        // function and must be freed on every exit path.
        char *new_src = PyMem_Malloc(srclen_ssz + 1);
        if (new_src == NULL) {
            return PyErr_NoMemory();
        }
        convert_urlsafe_to_encoded(src, srclen_ssz, new_src);
        src = new_src;
    }

    // Quickly ignore invalid characters at the end. Other invalid characters
    // are also accepted, but they need a slow path.
    while (srclen_ssz > 0 && !is_valid_base64_char(src[srclen_ssz - 1], true)) {
        srclen_ssz--;
    }

    // Compute an output capacity that's at least 3/4 of input, without overflow:
    // ceil(3/4 * N) == N - floor(N/4)
    size_t srclen = (size_t)srclen_ssz;
    size_t max_out = srclen - (srclen / 4);
    if (max_out == 0) {
        max_out = 1; // defensive (srclen > 0 implies >= 1 anyway)
    }
    if (max_out > (size_t)PY_SSIZE_T_MAX) {
        // BUGFIX: release the translation buffer on this error path too.
        if (urlsafe) {
            PyMem_Free((void *)src);
        }
        PyErr_SetString(PyExc_OverflowError, "input too large");
        return NULL;
    }

    // Allocate output bytes (uninitialized) of the max capacity
    PyObject *out_bytes = PyBytes_FromStringAndSize(NULL, (Py_ssize_t)max_out);
    if (out_bytes == NULL) {
        if (urlsafe) {
            PyMem_Free((void *)src);
        }
        return NULL; // Propagate memory error
    }
    char *outbuf = PyBytes_AS_STRING(out_bytes);

    size_t outlen = max_out;
    int ret = base64_decode(src, srclen, outbuf, &outlen, 0);
    if (ret != 1) {
        if (ret == 0) {
            // Slow path: handle non-base64 input (takes ownership of
            // out_bytes, and of src when urlsafe).
            return b64decode_handle_invalid_input(out_bytes, outbuf, max_out, src, srclen, urlsafe);
        }
        Py_DECREF(out_bytes);
        if (urlsafe) {
            PyMem_Free((void *)src);
        }
        if (ret == -1) {
            PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build");
        } else {
            PyErr_SetString(PyExc_RuntimeError, "base64_decode failed");
        }
        return NULL;
    }
    if (urlsafe) {
        PyMem_Free((void *)src);
    }

    // Sanity-check contract (decoder must not overflow our buffer)
    if (outlen > max_out) {
        Py_DECREF(out_bytes);
        PyErr_SetString(PyExc_RuntimeError, "decoder wrote past output buffer");
        return NULL;
    }

    // Shrink in place to the actual decoded length
    if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)outlen) < 0) {
        // _PyBytes_Resize sets an exception and may free the old object
        return NULL;
    }
    return out_bytes;
}
// Process non-base64 input by ignoring non-base64 characters, for compatibility
// with stdlib b64decode. Takes ownership of out_bytes (and of src when
// freesrc is true). Returns a new bytes object or NULL with an exception set.
// BUGFIX: the error branch below was `if (ret == 0) ...; if (ret == -1) ...
// else ...` — so a plain decode error (ret == 0) first set ValueError and
// then fell into the trailing else, which overwrote it with RuntimeError.
// The chain is now a proper if / else if / else.
static PyObject *
b64decode_handle_invalid_input(
    PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen, bool freesrc)
{
    // Copy input to a temporary buffer, with non-base64 characters and extra suffix
    // characters removed
    size_t newbuf_len = 0;
    char *newbuf = PyMem_Malloc(srclen);
    if (newbuf == NULL) {
        Py_DECREF(out_bytes);
        if (freesrc) {
            PyMem_Free((void *)src);
        }
        return PyErr_NoMemory();
    }

    // Copy base64 characters and some padding to the new buffer
    for (size_t i = 0; i < srclen; i++) {
        char c = src[i];
        if (is_valid_base64_char(c, false)) {
            newbuf[newbuf_len++] = c;
        } else if (c == '=') {
            // Copy a necessary amount of padding
            int remainder = newbuf_len % 4;
            if (remainder == 0) {
                // No padding needed
                break;
            }
            int numpad = 4 - remainder;
            // Check that there is at least the required amount padding (CPython ignores
            // extra padding)
            while (numpad > 0) {
                if (i == srclen || src[i] != '=') {
                    break;
                }
                newbuf[newbuf_len++] = '=';
                i++;
                numpad--;
                // Skip non-base64 alphabet characters within padding
                while (i < srclen && !is_valid_base64_char(src[i], true)) {
                    i++;
                }
            }
            break;
        }
    }

    // Stdlib always performs a non-strict padding check
    if (newbuf_len % 4 != 0) {
        if (freesrc) {
            PyMem_Free((void *)src);
        }
        Py_DECREF(out_bytes);
        PyMem_Free(newbuf);
        PyErr_SetString(PyExc_ValueError, "Incorrect padding");
        return NULL;
    }

    size_t outlen = max_out;
    int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0);
    PyMem_Free(newbuf);
    if (freesrc) {
        PyMem_Free((void *)src);
    }
    if (ret != 1) {
        Py_DECREF(out_bytes);
        if (ret == 0) {
            PyErr_SetString(PyExc_ValueError, "Only base64 data is allowed");
        } else if (ret == -1) {
            PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build");
        } else {
            PyErr_SetString(PyExc_RuntimeError, "base64_decode failed");
        }
        return NULL;
    }

    // Shrink in place to the actual decoded length
    if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)outlen) < 0) {
        // _PyBytes_Resize sets an exception and may free the old object
        return NULL;
    }
    return out_bytes;
}
// b64decode(data) -> bytes
// Decode a standard-alphabet base64 encoded bytes object or ASCII string.
static PyObject*
b64decode(PyObject *self, PyObject *const *args, size_t nargs) {
    if (nargs != 1) {
        PyErr_SetString(PyExc_TypeError, "b64decode() takes exactly one argument");
        return NULL;  // NULL, not 0, for pointer returns
    }
    return b64decode_internal(args[0], false);
}
// urlsafe_b64decode(data) -> bytes
// Decode base64 data that uses the URL- and filesystem-safe alphabet
// ('-' and '_' instead of '+' and '/').
static PyObject*
urlsafe_b64decode(PyObject *self, PyObject *const *args, size_t nargs) {
    if (nargs != 1) {
        PyErr_SetString(PyExc_TypeError, "urlsafe_b64decode() takes exactly one argument");
        return NULL;  // NULL, not 0, for pointer returns
    }
    return b64decode_internal(args[0], true);
}
// Module-level function table. All entries use the METH_FASTCALL calling
// convention (arguments passed as a C array plus a count).
static PyMethodDef librt_base64_module_methods[] = {
    {"b64encode", (PyCFunction)b64encode, METH_FASTCALL, PyDoc_STR("Encode bytes object using Base64.")},
    {"b64decode", (PyCFunction)b64decode, METH_FASTCALL, PyDoc_STR("Decode a Base64 encoded bytes object or ASCII string.")},
    {"urlsafe_b64encode", (PyCFunction)urlsafe_b64encode, METH_FASTCALL, PyDoc_STR("Encode bytes object using URL and file system safe Base64 alphabet.")},
    {"urlsafe_b64decode", (PyCFunction)urlsafe_b64decode, METH_FASTCALL, PyDoc_STR("Decode bytes or ASCII string using URL and file system safe Base64 alphabet.")},
    {NULL, NULL, 0, NULL}  // sentinel
};
// Return the ABI version compiled into this module. Exported through the
// _C_API capsule so importers can verify binary compatibility before
// calling through the function-pointer table (see the header's
// import_librt_base64()).
static int
base64_abi_version(void) {
    return LIBRT_BASE64_ABI_VERSION;
}
// Return the API version compiled into this module. Unlike the ABI version,
// consumers accept this value or anything newer (backward-compatible
// additions to the API table bump only this number).
static int
base64_api_version(void) {
    return LIBRT_BASE64_API_VERSION;
}
// Module exec slot (multi-phase init): publish the internal C API as a
// capsule named "librt.base64._C_API" so other compiled modules can bind
// the functions directly. Returns 0 on success, -1 with an exception set.
static int
librt_base64_module_exec(PyObject *m)
{
    // Export mypy internal C API, be careful with the order!
    static void *base64_api[LIBRT_BASE64_API_LEN] = {
        (void *)base64_abi_version,
        (void *)base64_api_version,
        (void *)b64encode_internal,
        (void *)b64decode_internal,
    };
    PyObject *c_api_object = PyCapsule_New((void *)base64_api, "librt.base64._C_API", NULL);
    if (c_api_object == NULL) {
        // PyCapsule_New has already set an exception; fail explicitly
        // instead of passing NULL to PyModule_Add.
        return -1;
    }
    // PyModule_Add steals the reference to c_api_object, even on failure.
    if (PyModule_Add(m, "_C_API", c_api_object) < 0) {
        return -1;
    }
    return 0;
}
// Multi-phase initialization slots (PEP 489). When built against a CPython
// that defines Py_MOD_GIL_NOT_USED, declare the module safe to run without
// the GIL on free-threaded builds.
static PyModuleDef_Slot librt_base64_module_slots[] = {
    {Py_mod_exec, librt_base64_module_exec},
#ifdef Py_MOD_GIL_NOT_USED
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
#endif
    {0, NULL}  // sentinel
};
// Module definition. m_size = 0: the module keeps no per-interpreter state
// (the capsule table above is immutable function pointers).
static PyModuleDef librt_base64_module = {
    .m_base = PyModuleDef_HEAD_INIT,
    .m_name = "base64",
    .m_doc = "Fast base64 encoding and decoding optimized for mypyc",
    .m_size = 0,
    .m_methods = librt_base64_module_methods,
    .m_slots = librt_base64_module_slots,
};
// Module entry point. With multi-phase init this only returns the module
// definition; actual setup happens later in librt_base64_module_exec.
PyMODINIT_FUNC
PyInit_base64(void)
{
    return PyModuleDef_Init(&librt_base64_module);
}

View file

@ -0,0 +1,50 @@
#ifndef LIBRT_BASE64_H
#define LIBRT_BASE64_H
#include <Python.h>
// ABI version: must match the provider exactly (the binary layout of the
// function-pointer table is not stable across ABI versions).
#define LIBRT_BASE64_ABI_VERSION 1
// API version: the provider must report this version or newer.
#define LIBRT_BASE64_API_VERSION 2
// Number of entries in the exported function-pointer table.
#define LIBRT_BASE64_API_LEN 4
// Per-translation-unit copy of the imported API table; populated by
// import_librt_base64() below.
static void *LibRTBase64_API[LIBRT_BASE64_API_LEN];
// Typed accessors into the table. Entry order must match the capsule layout
// built by the librt.base64 module's exec function.
#define LibRTBase64_ABIVersion (*(int (*)(void)) LibRTBase64_API[0])
#define LibRTBase64_APIVersion (*(int (*)(void)) LibRTBase64_API[1])
#define LibRTBase64_b64encode_internal (*(PyObject* (*)(PyObject *source, bool urlsafe)) LibRTBase64_API[2])
#define LibRTBase64_b64decode_internal (*(PyObject* (*)(PyObject *source, bool urlsafe)) LibRTBase64_API[3])
// Import librt.base64 and bind its "_C_API" capsule into LibRTBase64_API,
// then verify binary (ABI) and feature (API) compatibility.
// Returns 0 on success, -1 with a Python exception set on failure.
static int
import_librt_base64(void)
{
    PyObject *mod = PyImport_ImportModule("librt.base64");
    if (mod == NULL)
        return -1;
    Py_DECREF(mod); // we import just for the side effect of making the below work.
    void *capsule = PyCapsule_Import("librt.base64._C_API", 0);
    if (capsule == NULL)
        return -1;
    memcpy(LibRTBase64_API, capsule, sizeof(LibRTBase64_API));
    // The ABI must match exactly: the table layout may change between ABI
    // versions. PyErr_Format replaces the previous fixed 128-byte snprintf
    // buffer (simpler, no truncation risk).
    if (LibRTBase64_ABIVersion() != LIBRT_BASE64_ABI_VERSION) {
        PyErr_Format(PyExc_ValueError,
                     "ABI version conflict for librt.base64, expected %d, found %d",
                     LIBRT_BASE64_ABI_VERSION,
                     LibRTBase64_ABIVersion());
        return -1;
    }
    // Newer API versions are accepted (backward-compatible additions only).
    if (LibRTBase64_APIVersion() < LIBRT_BASE64_API_VERSION) {
        PyErr_Format(PyExc_ValueError,
                     "API version conflict for librt.base64, expected %d or newer, found %d "
                     "(hint: upgrade librt)",
                     LIBRT_BASE64_API_VERSION,
                     LibRTBase64_APIVersion());
        return -1;
    }
    return 0;
}
#endif // LIBRT_BASE64_H

View file

@ -0,0 +1,393 @@
// Decode tables for the plain 32-bit base64 decoder. Each of the four
// tables is indexed by one raw input byte and yields that byte's 6-bit
// base64 value pre-shifted to the bit position it occupies in the assembled
// output word (d0..d3 correspond to the four character positions of a
// base64 quartet; presumably the decoder ORs the four lookups together —
// see the generic 32-bit dec loop, not visible here). The value 0xffffffff
// marks any byte outside the base64 alphabet as invalid. Two variants are
// provided because the pre-shifted bit positions depend on byte order.
#include <stdint.h>
#define CHAR62 '+'
#define CHAR63 '/'
#define CHARPAD '='
#if BASE64_LITTLE_ENDIAN
/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */
// d0: value of the first character of a quartet (little-endian layout).
const uint32_t base64_table_dec_32bit_d0[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x000000f8, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000fc,
0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
0x00000064, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
0x000000c4, 0x000000c8, 0x000000cc, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
// d1: value of the second character of a quartet (little-endian layout).
const uint32_t base64_table_dec_32bit_d1[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x0000e003, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000f003,
0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
0x00009001, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
0x00001003, 0x00002003, 0x00003003, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
// d2: value of the third character of a quartet (little-endian layout).
const uint32_t base64_table_dec_32bit_d2[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00800f00, 0xffffffff, 0xffffffff, 0xffffffff, 0x00c00f00,
0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
0x00400600, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
0x00400c00, 0x00800c00, 0x00c00c00, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
// d3: value of the fourth character of a quartet (little-endian layout).
const uint32_t base64_table_dec_32bit_d3[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x003e0000, 0xffffffff, 0xffffffff, 0xffffffff, 0x003f0000,
0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
0x00190000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
0x00310000, 0x00320000, 0x00330000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
#else
/* SPECIAL DECODE TABLES FOR BIG ENDIAN (IBM/MOTOROLA/SUN) CPUS */
// d0: value of the first character of a quartet (big-endian layout).
const uint32_t base64_table_dec_32bit_d0[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xf8000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xfc000000,
0xd0000000, 0xd4000000, 0xd8000000, 0xdc000000, 0xe0000000, 0xe4000000,
0xe8000000, 0xec000000, 0xf0000000, 0xf4000000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x04000000, 0x08000000, 0x0c000000, 0x10000000, 0x14000000, 0x18000000,
0x1c000000, 0x20000000, 0x24000000, 0x28000000, 0x2c000000, 0x30000000,
0x34000000, 0x38000000, 0x3c000000, 0x40000000, 0x44000000, 0x48000000,
0x4c000000, 0x50000000, 0x54000000, 0x58000000, 0x5c000000, 0x60000000,
0x64000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x68000000, 0x6c000000, 0x70000000, 0x74000000, 0x78000000,
0x7c000000, 0x80000000, 0x84000000, 0x88000000, 0x8c000000, 0x90000000,
0x94000000, 0x98000000, 0x9c000000, 0xa0000000, 0xa4000000, 0xa8000000,
0xac000000, 0xb0000000, 0xb4000000, 0xb8000000, 0xbc000000, 0xc0000000,
0xc4000000, 0xc8000000, 0xcc000000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
// d1: value of the second character of a quartet (big-endian layout).
const uint32_t base64_table_dec_32bit_d1[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x03e00000, 0xffffffff, 0xffffffff, 0xffffffff, 0x03f00000,
0x03400000, 0x03500000, 0x03600000, 0x03700000, 0x03800000, 0x03900000,
0x03a00000, 0x03b00000, 0x03c00000, 0x03d00000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00100000, 0x00200000, 0x00300000, 0x00400000, 0x00500000, 0x00600000,
0x00700000, 0x00800000, 0x00900000, 0x00a00000, 0x00b00000, 0x00c00000,
0x00d00000, 0x00e00000, 0x00f00000, 0x01000000, 0x01100000, 0x01200000,
0x01300000, 0x01400000, 0x01500000, 0x01600000, 0x01700000, 0x01800000,
0x01900000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x01a00000, 0x01b00000, 0x01c00000, 0x01d00000, 0x01e00000,
0x01f00000, 0x02000000, 0x02100000, 0x02200000, 0x02300000, 0x02400000,
0x02500000, 0x02600000, 0x02700000, 0x02800000, 0x02900000, 0x02a00000,
0x02b00000, 0x02c00000, 0x02d00000, 0x02e00000, 0x02f00000, 0x03000000,
0x03100000, 0x03200000, 0x03300000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
// d2: value of the third character of a quartet (big-endian layout).
const uint32_t base64_table_dec_32bit_d2[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x000f8000, 0xffffffff, 0xffffffff, 0xffffffff, 0x000fc000,
0x000d0000, 0x000d4000, 0x000d8000, 0x000dc000, 0x000e0000, 0x000e4000,
0x000e8000, 0x000ec000, 0x000f0000, 0x000f4000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00004000, 0x00008000, 0x0000c000, 0x00010000, 0x00014000, 0x00018000,
0x0001c000, 0x00020000, 0x00024000, 0x00028000, 0x0002c000, 0x00030000,
0x00034000, 0x00038000, 0x0003c000, 0x00040000, 0x00044000, 0x00048000,
0x0004c000, 0x00050000, 0x00054000, 0x00058000, 0x0005c000, 0x00060000,
0x00064000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00068000, 0x0006c000, 0x00070000, 0x00074000, 0x00078000,
0x0007c000, 0x00080000, 0x00084000, 0x00088000, 0x0008c000, 0x00090000,
0x00094000, 0x00098000, 0x0009c000, 0x000a0000, 0x000a4000, 0x000a8000,
0x000ac000, 0x000b0000, 0x000b4000, 0x000b8000, 0x000bc000, 0x000c0000,
0x000c4000, 0x000c8000, 0x000cc000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
// d3: value of the fourth character of a quartet (big-endian layout).
const uint32_t base64_table_dec_32bit_d3[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00003e00, 0xffffffff, 0xffffffff, 0xffffffff, 0x00003f00,
0x00003400, 0x00003500, 0x00003600, 0x00003700, 0x00003800, 0x00003900,
0x00003a00, 0x00003b00, 0x00003c00, 0x00003d00, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00000100, 0x00000200, 0x00000300, 0x00000400, 0x00000500, 0x00000600,
0x00000700, 0x00000800, 0x00000900, 0x00000a00, 0x00000b00, 0x00000c00,
0x00000d00, 0x00000e00, 0x00000f00, 0x00001000, 0x00001100, 0x00001200,
0x00001300, 0x00001400, 0x00001500, 0x00001600, 0x00001700, 0x00001800,
0x00001900, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00001a00, 0x00001b00, 0x00001c00, 0x00001d00, 0x00001e00,
0x00001f00, 0x00002000, 0x00002100, 0x00002200, 0x00002300, 0x00002400,
0x00002500, 0x00002600, 0x00002700, 0x00002800, 0x00002900, 0x00002a00,
0x00002b00, 0x00002c00, 0x00002d00, 0x00002e00, 0x00002f00, 0x00003000,
0x00003100, 0x00003200, 0x00003300, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
#endif

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,40 @@
#include "tables.h"
// Scalar encode table: maps a 6-bit value (0..63) to its base64 character.
const uint8_t
base64_table_enc_6bit[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789"
"+/";
// In the lookup table below, note that the value for '=' (character 61) is
// 254, not 255. This character is used for in-band signaling of the end of
// the datastream, and we will use that later. The characters A-Z, a-z, 0-9
// and + / are mapped to their "decoded" values. The other bytes all map to
// the value 255, which flags them as "invalid input".
const uint8_t
base64_table_dec_8bit[] =
{
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0..15
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 16..31
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, // 32..47
	52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 254, 255, 255, // 48..63
	255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, // 64..79
	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, // 80..95
	255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, // 96..111
	41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, // 112..127
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 128..143
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 144..159
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 160..175
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 176..191
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 192..207
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 208..223
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 224..239
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 240..255
};
// Wider precomputed tables are only needed by the 32/64-bit generic codecs.
#if BASE64_WORDSIZE >= 32
# include "table_dec_32bit.h"
# include "table_enc_12bit.h"
#endif

View file

@ -0,0 +1,23 @@
#ifndef BASE64_TABLES_H
#define BASE64_TABLES_H
#include <stdint.h>
#include "../env.h"
// These tables are used by all codecs for fallback plain encoding/decoding:
extern const uint8_t base64_table_enc_6bit[];
extern const uint8_t base64_table_dec_8bit[];
// These tables are used for the 32-bit and 64-bit generic decoders
// (one table per character position of a base64 quartet):
#if BASE64_WORDSIZE >= 32
extern const uint32_t base64_table_dec_32bit_d0[];
extern const uint32_t base64_table_dec_32bit_d1[];
extern const uint32_t base64_table_dec_32bit_d2[];
extern const uint32_t base64_table_dec_32bit_d3[];
// This table is used by the 32 and 64-bit generic encoders:
extern const uint16_t base64_table_enc_12bit[];
#endif
#endif // BASE64_TABLES_H

View file

@ -0,0 +1,69 @@
# This file must have the same content for mypyc/build_setup.py and lib-rt/build_setup.py,
# it exists to work around absence of support for per-file compile flags in setuptools.
# The version in mypyc/ is the source of truth, and should be copied to lib-rt if modified.
import os
import platform
import sys

try:
    # Import setuptools so that it monkey-patch overrides distutils
    import setuptools  # noqa: F401
except ImportError:
    pass
if sys.version_info >= (3, 12):
    # From setuptools' monkeypatch
    from distutils import ccompiler  # type: ignore[import-not-found]
else:
    from distutils import ccompiler

# Extra compile flags keyed by compiler type, then by a path component:
# a source file whose path contains the component gets the listed flags.
EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT = {
    "msvc": {
        "base64/arch/sse42": ["/arch:SSE4.2"],
        "base64/arch/avx2": ["/arch:AVX2"],
        "base64/arch/avx": ["/arch:AVX"],
    }
}

# Keep a reference to the original spawn so the patched version can delegate to it.
ccompiler.CCompiler.__spawn = ccompiler.CCompiler.spawn  # type: ignore[attr-defined]

X86_64 = platform.machine() in ("x86_64", "AMD64", "amd64")
PYODIDE = "PYODIDE" in os.environ
NO_EXTRA_FLAGS = "MYPYC_NO_EXTRA_FLAGS" in os.environ


def spawn(self, cmd, **kwargs) -> None:  # type: ignore[no-untyped-def]
    # Replacement for CCompiler.spawn: inspect the compiler command line and
    # append per-file extra flags before delegating to the original spawn.
    # NOTE(review): assumes each spawn invocation compiles at most one source
    # file matching a flagged path — confirm if batch compilation is possible.
    new_cmd = list(cmd)
    if PYODIDE:
        # Emscripten/Pyodide builds: enable WASM SIMD for the base64 kernels.
        for argument in reversed(new_cmd):
            if not str(argument).endswith(".c"):
                continue
            if "base64/arch/" in str(argument):
                new_cmd.extend(["-msimd128"])
    elif not NO_EXTRA_FLAGS:
        compiler_type: str = self.compiler_type
        extra_options = EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT.get(compiler_type, None)
        if X86_64 and extra_options is not None:
            # filenames are closer to the end of command line
            for argument in reversed(new_cmd):
                # Check if the matching argument contains a source filename.
                if not str(argument).endswith(".c"):
                    continue
                for path in extra_options.keys():
                    if path in str(argument):
                        if compiler_type == "bcpp":
                            compiler = new_cmd.pop()
                            # Borland accepts a source file name at the end,
                            # insert the options before it
                            new_cmd.extend(extra_options[path])
                            new_cmd.append(compiler)
                        else:
                            new_cmd.extend(extra_options[path])
                        # path component is found, no need to search any further
                        break
    self.__spawn(new_cmd, **kwargs)


ccompiler.CCompiler.spawn = spawn  # type: ignore[method-assign]

View file

@ -0,0 +1,5 @@
#include "bytearray_extra_ops.h"
// Construct a fresh, empty bytearray object (new reference), or NULL with
// an error set if allocation fails.
PyObject *CPyByteArray_New(void) {
    PyObject *result = PyByteArray_FromStringAndSize(NULL, 0);
    return result;
}

View file

@ -0,0 +1,10 @@
#ifndef MYPYC_BYTEARRAY_EXTRA_OPS_H
#define MYPYC_BYTEARRAY_EXTRA_OPS_H

#include <Python.h>
#include "CPy.h"

// Construct empty bytearray
// Returns a new reference, or NULL with an error set on allocation failure.
PyObject *CPyByteArray_New(void);

#endif

View file

@ -0,0 +1,45 @@
#include "bytes_extra_ops.h"
// bytes.translate(table) with a fast path for exact bytes inputs.
// Returns a new reference (or the original object, re-incref'd, when the
// translation is an identity mapping). NULL with an error set on failure.
PyObject *CPyBytes_Translate(PyObject *bytes, PyObject *table) {
    // Fast path: exact bytes object with exact bytes table
    if (PyBytes_CheckExact(bytes) && PyBytes_CheckExact(table)) {
        Py_ssize_t table_len = PyBytes_GET_SIZE(table);
        if (table_len != 256) {
            PyErr_SetString(PyExc_ValueError,
                            "translation table must be 256 characters long");
            return NULL;
        }
        Py_ssize_t len = PyBytes_GET_SIZE(bytes);
        const char *input = PyBytes_AS_STRING(bytes);
        const char *trans_table = PyBytes_AS_STRING(table);
        PyObject *result = PyBytes_FromStringAndSize(NULL, len);
        if (result == NULL) {
            return NULL;
        }
        char *output = PyBytes_AS_STRING(result);
        // Track whether any byte actually changed so we can return the
        // original object unchanged (avoids an allocation surviving).
        bool changed = false;
        // Without a loop unrolling hint performance can be worse than CPython
        CPY_UNROLL_LOOP(4)
        for (Py_ssize_t i = len; --i >= 0;) {
            char c = *input++;
            if ((*output++ = trans_table[(unsigned char)c]) != c)
                changed = true;
        }
        // If nothing changed, discard result and return the original object
        if (!changed) {
            Py_DECREF(result);
            Py_INCREF(bytes);
            return bytes;
        }
        return result;
    }
    // Fallback to Python method call for non-exact types or non-standard tables
    return PyObject_CallMethodOneArg(bytes, mypyc_interned_str.translate, table);
}

View file

@ -0,0 +1,31 @@
#ifndef MYPYC_BYTES_EXTRA_OPS_H
#define MYPYC_BYTES_EXTRA_OPS_H

#include <Python.h>
#include <stdint.h>
#include "CPy.h"

// Optimized bytes translate operation
PyObject *CPyBytes_Translate(PyObject *bytes, PyObject *table);

// Optimized bytes.__getitem__ operations

// If index is negative, convert to non-negative index (no range checking)
static inline int64_t CPyBytes_AdjustIndex(PyObject *obj, int64_t index) {
    if (index < 0) {
        return index + Py_SIZE(obj);
    }
    return index;
}

// Check if index is in valid range [0, len)
static inline bool CPyBytes_RangeCheck(PyObject *obj, int64_t index) {
    return index >= 0 && index < Py_SIZE(obj);
}

// Get byte at index (no bounds checking) - returns as CPyTagged
// (the byte value shifted left by one, i.e. a tagged short int).
static inline CPyTagged CPyBytes_GetItemUnsafe(PyObject *obj, int64_t index) {
    return ((CPyTagged)(uint8_t)(PyBytes_AS_STRING(obj))[index]) << 1;
}

#endif

View file

@ -0,0 +1,218 @@
// Bytes primitive operations
//
// These are registered in mypyc.primitives.bytes_ops.
#include <Python.h>
#include "CPy.h"
// Equality comparison for bytes objects.
// Returns 1 on equality, 0 on inequality, -1 on error.
//
// Falls back to PyObject_RichCompareBool for non-exact bytes operands.
int CPyBytes_Compare(PyObject *left, PyObject *right) {
    if (!PyBytes_CheckExact(left) || !PyBytes_CheckExact(right)) {
        return PyObject_RichCompareBool(left, right, Py_EQ);
    }
    if (left == right) {
        return 1;
    }
    // Adapted from cpython internal implementation of bytes_compare.
    Py_ssize_t size = Py_SIZE(left);
    if (Py_SIZE(right) != size) {
        return 0;
    }
    const char *ldata = ((PyBytesObject *)left)->ob_sval;
    const char *rdata = ((PyBytesObject *)right)->ob_sval;
    // Cheap first-byte probe before the full memcmp.
    if (ldata[0] != rdata[0]) {
        return 0;
    }
    return memcmp(ldata, rdata, size) == 0;
}
// bytes/bytearray subscript with a tagged-int index.
// Returns the byte value as a tagged short int (value << 1), or
// CPY_INT_TAG with an error set on out-of-range or oversized index.
CPyTagged CPyBytes_GetItem(PyObject *o, CPyTagged index) {
    if (CPyTagged_CheckShort(index)) {
        Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
        Py_ssize_t size = ((PyVarObject *)o)->ob_size;
        // Normalize negative indices, then bounds-check.
        if (n < 0)
            n += size;
        if (n < 0 || n >= size) {
            PyErr_SetString(PyExc_IndexError, "index out of range");
            return CPY_INT_TAG;
        }
        // Works for both bytes and bytearray; storage layouts differ.
        unsigned char num = PyBytes_Check(o) ? ((PyBytesObject *)o)->ob_sval[n]
                                             : ((PyByteArrayObject *)o)->ob_bytes[n];
        return num << 1;
    } else {
        // Index is a boxed (arbitrary-precision) int: cannot be in range.
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return CPY_INT_TAG;
    }
}
// Concatenate two exact bytes objects into a new bytes object.
// NOTE(review): reads ob_sval directly, so both arguments are assumed to be
// (exact) bytes, not bytearray — enforced by the mypyc caller; confirm.
// Returns a new reference, or NULL with an error set on allocation failure.
PyObject *CPyBytes_Concat(PyObject *a, PyObject *b) {
    Py_ssize_t a_len = ((PyVarObject *)a)->ob_size;
    Py_ssize_t b_len = ((PyVarObject *)b)->ob_size;
    PyBytesObject *ret = (PyBytesObject *)PyBytes_FromStringAndSize(NULL, a_len + b_len);
    if (ret != NULL) {
        memcpy(ret->ob_sval, ((PyBytesObject *)a)->ob_sval, a_len);
        memcpy(ret->ob_sval + a_len, ((PyBytesObject *)b)->ob_sval, b_len);
    }
    return (PyObject *)ret;
}
// Clamp 'value' to the inclusive range [lo, hi].
static inline Py_ssize_t Clamp(Py_ssize_t value, Py_ssize_t lo, Py_ssize_t hi) {
    if (value < lo) {
        return lo;
    }
    if (value >= hi) {
        return hi;
    }
    return value;
}
// Slice a bytes or bytearray object with tagged-int bounds, following
// Python slice semantics (negative indices, clamping to [0, len]).
// Falls back to the generic slice path for boxed-int bounds.
// Returns a new reference, or NULL with an error set.
PyObject *CPyBytes_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if (CPyTagged_CheckShort(start) && CPyTagged_CheckShort(end)) {
        Py_ssize_t startn = CPyTagged_ShortAsSsize_t(start);
        Py_ssize_t endn = CPyTagged_ShortAsSsize_t(end);
        Py_ssize_t len = ((PyVarObject *)obj)->ob_size;
        // Negative bounds count from the end of the sequence.
        if (startn < 0) {
            startn += len;
        }
        if (endn < 0) {
            endn += len;
        }
        startn = Clamp(startn, 0, len);
        endn = Clamp(endn, 0, len);
        Py_ssize_t slice_len = endn - startn;
        // Preserve the concrete type of the operand (bytes vs bytearray).
        if (PyBytes_Check(obj)) {
            return PyBytes_FromStringAndSize(PyBytes_AS_STRING(obj) + startn, slice_len);
        } else {
            return PyByteArray_FromStringAndSize(PyByteArray_AS_STRING(obj) + startn, slice_len);
        }
    }
    return CPyObject_GetSlice(obj, start, end);
}
// Like _PyBytes_Join but fallback to dynamic call if 'sep' is not bytes
// (mostly commonly, for bytearrays)
PyObject *CPyBytes_Join(PyObject *sep, PyObject *iter) {
    if (!PyBytes_CheckExact(sep)) {
        // Dispatch sep.join(iter) through the method protocol.
        return PyObject_CallMethodOneArg(sep, mypyc_interned_str.join, iter);
    }
    return PyBytes_Join(sep, iter);
}
// Concatenate 'len' bytes objects passed as varargs into one bytes object.
// Returns a new reference, or NULL with an error set on overflow or
// allocation failure.
PyObject *CPyBytes_Build(Py_ssize_t len, ...) {
    Py_ssize_t i;
    Py_ssize_t sz = 0;

    va_list args;

    // First pass: compute the total result size, checking for overflow.
    va_start(args, len);
    for (i = 0; i < len; i++) {
        PyObject *item = va_arg(args, PyObject *);
        size_t add_sz = ((PyVarObject *)item)->ob_size;
        // Using size_t to avoid overflow during arithmetic calculation
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python bytes");
            // Fix: va_end was missing on this early-return path; every
            // va_start must be matched by va_end (C11 7.16.1).
            va_end(args);
            return NULL;
        }
        sz += add_sz;
    }
    va_end(args);

    PyBytesObject *ret = (PyBytesObject *)PyBytes_FromStringAndSize(NULL, sz);
    if (ret != NULL) {
        // Second pass: copy each item's payload into the result buffer.
        char *res_data = ret->ob_sval;
        va_start(args, len);
        for (i = 0; i < len; i++) {
            PyObject *item = va_arg(args, PyObject *);
            Py_ssize_t item_sz = ((PyVarObject *)item)->ob_size;
            memcpy(res_data, ((PyBytesObject *)item)->ob_sval, item_sz);
            res_data += item_sz;
        }
        va_end(args);
        assert(res_data == ret->ob_sval + ((PyVarObject *)ret)->ob_size);
    }
    return (PyObject *)ret;
}
// ord() for a length-1 bytes object: return the byte value as a tagged
// short int, or CPY_INT_TAG with a TypeError set for any other length.
CPyTagged CPyBytes_Ord(PyObject *obj) {
    if (PyBytes_GET_SIZE(obj) != 1) {
        PyErr_SetString(PyExc_TypeError, "ord() expects a character");
        return CPY_INT_TAG;
    }
    unsigned char ch = (unsigned char)PyBytes_AS_STRING(obj)[0];
    return ch << 1;
}
// bytes * count for a tagged-int count. Counts that do not fit in
// Py_ssize_t raise OverflowError. Returns a new reference or NULL.
PyObject *CPyBytes_Multiply(PyObject *bytes, CPyTagged count) {
    Py_ssize_t repeat = CPyTagged_AsSsize_t(count);
    if (repeat == -1 && PyErr_Occurred()) {
        // Replace the conversion error with the standard message.
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PySequence_Repeat(bytes, repeat);
}
// bytes.startswith(prefix) fast path for exact bytes operands.
// Returns 1 (true), 0 (false), or 2 to signal an error from the fallback.
int CPyBytes_Startswith(PyObject *self, PyObject *subobj) {
    if (!(PyBytes_CheckExact(self) && PyBytes_CheckExact(subobj))) {
        // Generic fallback: call the startswith() method dynamically.
        PyObject *result = PyObject_CallMethodOneArg(self, mypyc_interned_str.startswith, subobj);
        if (result == NULL) {
            return 2;
        }
        int truth = PyObject_IsTrue(result);
        Py_DECREF(result);
        if (truth < 0) {
            return 2;
        }
        return truth;
    }
    if (self == subobj) {
        return 1;
    }
    Py_ssize_t prefix_len = PyBytes_GET_SIZE(subobj);
    if (prefix_len == 0) {
        // Every bytes object starts with the empty prefix.
        return 1;
    }
    Py_ssize_t self_len = PyBytes_GET_SIZE(self);
    if (prefix_len > self_len) {
        return 0;
    }
    const char *haystack = PyBytes_AS_STRING(self);
    const char *prefix = PyBytes_AS_STRING(subobj);
    return memcmp(haystack, prefix, (size_t)prefix_len) == 0 ? 1 : 0;
}
// bytes.endswith(suffix) fast path for exact bytes operands.
// Returns 1 (true), 0 (false), or 2 to signal an error from the fallback.
int CPyBytes_Endswith(PyObject *self, PyObject *subobj) {
    if (!(PyBytes_CheckExact(self) && PyBytes_CheckExact(subobj))) {
        // Generic fallback: call the endswith() method dynamically.
        PyObject *result = PyObject_CallMethodOneArg(self, mypyc_interned_str.endswith, subobj);
        if (result == NULL) {
            return 2;
        }
        int truth = PyObject_IsTrue(result);
        Py_DECREF(result);
        if (truth < 0) {
            return 2;
        }
        return truth;
    }
    if (self == subobj) {
        return 1;
    }
    Py_ssize_t suffix_len = PyBytes_GET_SIZE(subobj);
    if (suffix_len == 0) {
        // Every bytes object ends with the empty suffix.
        return 1;
    }
    Py_ssize_t self_len = PyBytes_GET_SIZE(self);
    if (suffix_len > self_len) {
        return 0;
    }
    const char *haystack = PyBytes_AS_STRING(self);
    const char *suffix = PyBytes_AS_STRING(subobj);
    // Compare against the tail of self.
    return memcmp(haystack + (self_len - suffix_len), suffix, (size_t)suffix_len) == 0 ? 1 : 0;
}

View file

@ -0,0 +1,45 @@
// Primitives related to librt.strings.BytesWriter that get linked statically
// with compiled modules, instead of being called via a capsule.
#include "byteswriter_extra_ops.h"
#ifdef MYPYC_EXPERIMENTAL
// Append the contents of a bytes or bytearray 'value' to a BytesWriter.
// NOTE(review): the else branch assumes 'value' is a bytearray when it is
// not bytes — presumably enforced by the mypyc type system; confirm.
// Returns CPY_NONE on success, CPY_NONE_ERROR on allocation failure.
char CPyBytesWriter_Write(PyObject *obj, PyObject *value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    const char *data;
    Py_ssize_t size;
    if (likely(PyBytes_Check(value))) {
        data = PyBytes_AS_STRING(value);
        size = PyBytes_GET_SIZE(value);
    } else {
        data = PyByteArray_AS_STRING(value);
        size = PyByteArray_GET_SIZE(value);
    }
    // Write bytes content.
    if (!CPyBytesWriter_EnsureSize(self, size))
        return CPY_NONE_ERROR;
    if (size < 8) {
        // Loop tends to be faster for small sizes
        char *p = self->buf + self->len;
        for (Py_ssize_t i = 0; i < size; i++) {
            p[i] = data[i];
        }
    } else {
        memcpy(self->buf + self->len, data, size);
    }
    self->len += size;
    return CPY_NONE;
}
// Report a failed bytes read: ValueError for a negative index,
// IndexError when the index is past the end of the buffer.
void CPyBytes_ReadError(int64_t index, Py_ssize_t size) {
    if (index >= 0) {
        PyErr_Format(PyExc_IndexError,
                     "index %lld out of range for bytes of length %zd",
                     (long long)index, size);
        return;
    }
    PyErr_SetString(PyExc_ValueError, "index must be non-negative");
}
#endif // MYPYC_EXPERIMENTAL

View file

@ -0,0 +1,291 @@
#ifndef BYTESWRITER_EXTRA_OPS_H
#define BYTESWRITER_EXTRA_OPS_H

#ifdef MYPYC_EXPERIMENTAL

#include <stdbool.h>
#include <stdint.h>
#include <Python.h>
#include "mypyc_util.h"
#include "strings/librt_strings.h"
#include "strings/librt_strings_common.h"

// Inline primitives for librt.strings.BytesWriter plus fixed-width binary
// reads from bytes objects. Write helpers return CPY_NONE on success and
// CPY_NONE_ERROR if the buffer cannot be grown; read helpers return
// CPY_LL_INT_ERROR / CPY_FLOAT_ERROR with an error set on bad indices.

// BytesWriter: Length and capacity

// Current length as a tagged short int (len << 1).
static inline CPyTagged
CPyBytesWriter_Len(PyObject *obj) {
    return (CPyTagged)((BytesWriterObject *)obj)->len << 1;
}

// Ensure there is room for n more bytes; grows the buffer if needed.
static inline bool
CPyBytesWriter_EnsureSize(BytesWriterObject *data, Py_ssize_t n) {
    if (likely(data->capacity - data->len >= n)) {
        return true;
    } else {
        return LibRTStrings_ByteWriter_grow_buffer_internal(data, n);
    }
}

// BytesWriter: Basic write operations

// Append a single byte.
static inline char
CPyBytesWriter_Append(PyObject *obj, uint8_t value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    // Store length in a local variable to enable additional optimizations
    Py_ssize_t len = self->len;
    if (!CPyBytesWriter_EnsureSize(self, 1))
        return CPY_NONE_ERROR;
    self->buf[len] = value;
    self->len = len + 1;
    return CPY_NONE;
}

// Append the contents of a bytes/bytearray object (defined in the .c file).
char CPyBytesWriter_Write(PyObject *obj, PyObject *value);

// BytesWriter: Indexing operations

// If index is negative, convert to non-negative index (no range checking)
static inline int64_t CPyBytesWriter_AdjustIndex(PyObject *obj, int64_t index) {
    if (index < 0) {
        return index + ((BytesWriterObject *)obj)->len;
    }
    return index;
}

// Check if index is in valid range [0, len).
static inline bool CPyBytesWriter_RangeCheck(PyObject *obj, int64_t index) {
    return index >= 0 && index < ((BytesWriterObject *)obj)->len;
}

// Read/write one byte at index (no bounds checking).
static inline uint8_t CPyBytesWriter_GetItem(PyObject *obj, int64_t index) {
    return (((BytesWriterObject *)obj)->buf)[index];
}

static inline void CPyBytesWriter_SetItem(PyObject *obj, int64_t index, uint8_t x) {
    (((BytesWriterObject *)obj)->buf)[index] = x;
}

// BytesWriter: Write integer operations (little-endian)
// Each helper reserves space and delegates to the corresponding
// BytesWriter_Write*Unsafe primitive, which also advances len.

static inline char
CPyBytesWriter_WriteI16LE(PyObject *obj, int16_t value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 2))
        return CPY_NONE_ERROR;
    BytesWriter_WriteI16LEUnsafe(self, value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteI16BE(PyObject *obj, int16_t value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 2))
        return CPY_NONE_ERROR;
    BytesWriter_WriteI16BEUnsafe(self, value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteI32LE(PyObject *obj, int32_t value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 4))
        return CPY_NONE_ERROR;
    BytesWriter_WriteI32LEUnsafe(self, value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteI32BE(PyObject *obj, int32_t value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 4))
        return CPY_NONE_ERROR;
    BytesWriter_WriteI32BEUnsafe(self, value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteI64LE(PyObject *obj, int64_t value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 8))
        return CPY_NONE_ERROR;
    BytesWriter_WriteI64LEUnsafe(self, value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteI64BE(PyObject *obj, int64_t value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 8))
        return CPY_NONE_ERROR;
    BytesWriter_WriteI64BEUnsafe(self, value);
    return CPY_NONE;
}

// BytesWriter: Write float operations
// f32 variants narrow the double argument to float before writing.

static inline char
CPyBytesWriter_WriteF32LE(PyObject *obj, double value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 4))
        return CPY_NONE_ERROR;
    BytesWriter_WriteF32LEUnsafe(self, (float)value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteF32BE(PyObject *obj, double value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 4))
        return CPY_NONE_ERROR;
    BytesWriter_WriteF32BEUnsafe(self, (float)value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteF64LE(PyObject *obj, double value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 8))
        return CPY_NONE_ERROR;
    BytesWriter_WriteF64LEUnsafe(self, value);
    return CPY_NONE;
}

static inline char
CPyBytesWriter_WriteF64BE(PyObject *obj, double value) {
    BytesWriterObject *self = (BytesWriterObject *)obj;
    if (!CPyBytesWriter_EnsureSize(self, 8))
        return CPY_NONE_ERROR;
    BytesWriter_WriteF64BEUnsafe(self, value);
    return CPY_NONE;
}

// Bytes: Read integer operations
// Bounds check is `index < 0 || index > size - N`; size is signed, so
// buffers shorter than N bytes are rejected correctly.

// Helper function for bytes read error handling (negative index or out of range)
void CPyBytes_ReadError(int64_t index, Py_ssize_t size);

static inline int16_t
CPyBytes_ReadI16LE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 2)) {
        CPyBytes_ReadError(index, size);
        return CPY_LL_INT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadI16LEUnsafe(data + index);
}

static inline int16_t
CPyBytes_ReadI16BE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 2)) {
        CPyBytes_ReadError(index, size);
        return CPY_LL_INT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadI16BEUnsafe(data + index);
}

static inline int32_t
CPyBytes_ReadI32BE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 4)) {
        CPyBytes_ReadError(index, size);
        return CPY_LL_INT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadI32BEUnsafe(data + index);
}

static inline int32_t
CPyBytes_ReadI32LE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 4)) {
        CPyBytes_ReadError(index, size);
        return CPY_LL_INT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadI32LEUnsafe(data + index);
}

static inline int64_t
CPyBytes_ReadI64LE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 8)) {
        CPyBytes_ReadError(index, size);
        return CPY_LL_INT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadI64LEUnsafe(data + index);
}

static inline int64_t
CPyBytes_ReadI64BE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 8)) {
        CPyBytes_ReadError(index, size);
        return CPY_LL_INT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadI64BEUnsafe(data + index);
}

// Bytes: Read float operations
// f32 variants widen the stored float to double on return.

static inline double
CPyBytes_ReadF32LE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 4)) {
        CPyBytes_ReadError(index, size);
        return CPY_FLOAT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return (double)CPyBytes_ReadF32LEUnsafe(data + index);
}

static inline double
CPyBytes_ReadF32BE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 4)) {
        CPyBytes_ReadError(index, size);
        return CPY_FLOAT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return (double)CPyBytes_ReadF32BEUnsafe(data + index);
}

static inline double
CPyBytes_ReadF64LE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 8)) {
        CPyBytes_ReadError(index, size);
        return CPY_FLOAT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadF64LEUnsafe(data + index);
}

static inline double
CPyBytes_ReadF64BE(PyObject *bytes_obj, int64_t index) {
    // bytes_obj type is enforced by mypyc
    Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj);
    if (unlikely(index < 0 || index > size - 8)) {
        CPyBytes_ReadError(index, size);
        return CPY_FLOAT_ERROR;
    }
    const unsigned char *data = (const unsigned char *)PyBytes_AS_STRING(bytes_obj);
    return CPyBytes_ReadF64BEUnsafe(data + index);
}

#endif // MYPYC_EXPERIMENTAL

#endif

View file

@ -0,0 +1,429 @@
// Dict primitive operations
//
// These are registered in mypyc.primitives.dict_ops.
#include <Python.h>
#include "CPy.h"
#ifndef Py_TPFLAGS_MAPPING
#define Py_TPFLAGS_MAPPING (1 << 6)
#endif
// Dict subclasses like defaultdict override things in interesting
// ways, so we don't want to just directly use the dict methods. Not
// sure if it is actually worth doing all this stuff, but it saves
// some indirections.
// dict[key] with a fast path for exact dicts.
// Returns a new reference, or NULL with KeyError (or the lookup error) set.
PyObject *CPyDict_GetItem(PyObject *dict, PyObject *key) {
    if (!PyDict_CheckExact(dict)) {
        // Subclasses may override __getitem__; use the generic protocol.
        return PyObject_GetItem(dict, key);
    }
    PyObject *value = PyDict_GetItemWithError(dict, key);
    if (value != NULL) {
        Py_INCREF(value);
        return value;
    }
    // Distinguish "key absent" from a real error raised during lookup.
    if (!PyErr_Occurred()) {
        PyErr_SetObject(PyExc_KeyError, key);
    }
    return NULL;
}
// Build a dict from 'size' key/value pairs passed as varargs.
// Returns a new reference, or NULL with an error set.
PyObject *CPyDict_Build(Py_ssize_t size, ...) {
    Py_ssize_t i;

    PyObject *res = _PyDict_NewPresized(size);
    if (res == NULL) {
        return NULL;
    }

    va_list args;
    va_start(args, size);

    for (i = 0; i < size; i++) {
        PyObject *key = va_arg(args, PyObject *);
        PyObject *value = va_arg(args, PyObject *);
        if (PyDict_SetItem(res, key, value)) {
            // Fix: va_end was missing on this early-return path; every
            // va_start must be matched by va_end (C11 7.16.1).
            va_end(args);
            Py_DECREF(res);
            return NULL;
        }
    }

    va_end(args);
    return res;
}
// dict.get(key, fallback): return the stored value or 'fallback' when the
// key is absent. Always returns a new reference; NULL on lookup error.
PyObject *CPyDict_Get(PyObject *dict, PyObject *key, PyObject *fallback) {
    // We are dodgily assuming that get on a subclass doesn't have
    // different behavior.
    PyObject *value = PyDict_GetItemWithError(dict, key);
    if (value == NULL) {
        if (PyErr_Occurred()) {
            return NULL;
        }
        value = fallback;
    }
    Py_INCREF(value);
    return value;
}
// dict.get(key) with the implicit None default.
// Returns a new reference, or NULL on lookup error.
PyObject *CPyDict_GetWithNone(PyObject *dict, PyObject *key) {
    return CPyDict_Get(dict, key, Py_None);
}
// dict.setdefault(key, value) with a fast path for exact dicts.
// Returns a new reference, or NULL with an error set.
PyObject *CPyDict_SetDefault(PyObject *dict, PyObject *key, PyObject *value) {
    if (!PyDict_CheckExact(dict)) {
        // Subclasses may override setdefault(); call the method.
        return PyObject_CallMethodObjArgs(dict, mypyc_interned_str.setdefault, key, value, NULL);
    }
    PyObject *result = PyDict_SetDefault(dict, key, value);
    Py_XINCREF(result);
    return result;
}
// dict.setdefault(key) with the implicit None default.
// Returns a new reference, or NULL with an error set.
PyObject *CPyDict_SetDefaultWithNone(PyObject *dict, PyObject *key) {
    return CPyDict_SetDefault(dict, key, Py_None);
}
// setdefault() specialized for an empty container default:
// data_type 1 -> list, 2 -> dict, 3 -> set. Returns the existing value or
// the newly inserted container (new reference); NULL with an error set on
// failure (or without one for an unknown data_type, matching callers).
PyObject *CPyDict_SetDefaultWithEmptyDatatype(PyObject *dict, PyObject *key,
                                              int data_type) {
    PyObject *res = CPyDict_GetItem(dict, key);
    if (!res) {
        // CPyDict_GetItem() generates a PyExc_KeyError
        // when key is not found.
        PyErr_Clear();

        PyObject *new_obj;
        if (data_type == 1) {
            new_obj = PyList_New(0);
        } else if (data_type == 2) {
            new_obj = PyDict_New();
        } else if (data_type == 3) {
            new_obj = PySet_New(NULL);
        } else {
            return NULL;
        }
        // Fix: the allocation result was previously used unchecked;
        // inserting NULL into the dict would be invalid.
        if (new_obj == NULL) {
            return NULL;
        }

        if (CPyDict_SetItem(dict, key, new_obj) == -1) {
            // Fix: new_obj was previously leaked on insertion failure.
            Py_DECREF(new_obj);
            return NULL;
        } else {
            return new_obj;
        }
    } else {
        return res;
    }
}
// dict[key] = value; generic protocol for dict subclasses.
// Returns 0 on success, -1 with an error set on failure.
int CPyDict_SetItem(PyObject *dict, PyObject *key, PyObject *value) {
    return PyDict_CheckExact(dict)
               ? PyDict_SetItem(dict, key, value)
               : PyObject_SetItem(dict, key, value);
}
// Convert a call result into a status code: release the object and return
// 0 on success, or return -1 when the call failed (obj == NULL).
static inline int CPy_ObjectToStatus(PyObject *obj) {
    if (obj == NULL) {
        return -1;
    }
    Py_DECREF(obj);
    return 0;
}
// Generic fallback for dict.update(): call the update() method and fold
// the result into a 0 / -1 status code.
static int CPyDict_UpdateGeneral(PyObject *dict, PyObject *stuff) {
    PyObject *res = PyObject_CallMethodOneArg(dict, mypyc_interned_str.update, stuff);
    return CPy_ObjectToStatus(res);
}
// Dict update used for `{**a, **b}` displays; mirrors interpreter behavior.
// Returns 0 on success, -1 with an error set on failure.
int CPyDict_UpdateInDisplay(PyObject *dict, PyObject *stuff) {
    // from https://github.com/python/cpython/blob/55d035113dfb1bd90495c8571758f504ae8d4802/Python/ceval.c#L2710
    int status = PyDict_Update(dict, stuff);
    if (status >= 0) {
        return status;
    }
    // Rewrite the AttributeError raised for non-mappings into the TypeError
    // the interpreter produces for dict displays.
    if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.200s' object is not a mapping",
                     Py_TYPE(stuff)->tp_name);
    }
    return status;
}
// dict.update(stuff) with a mapping argument.
// Returns 0 on success, -1 with an error set on failure.
int CPyDict_Update(PyObject *dict, PyObject *stuff) {
    return PyDict_CheckExact(dict)
               ? PyDict_Update(dict, stuff)
               : CPyDict_UpdateGeneral(dict, stuff);
}
// dict.update(stuff) where 'stuff' may be a mapping or an iterable of
// key/value pairs. Returns 0 on success, -1 with an error set.
int CPyDict_UpdateFromAny(PyObject *dict, PyObject *stuff) {
    if (!PyDict_CheckExact(dict)) {
        return CPyDict_UpdateGeneral(dict, stuff);
    }
    // Argh this sucks: anything exposing .keys() merges key-wise,
    // everything else is treated as a sequence of 2-item pairs.
    if (PyDict_Check(stuff) || PyObject_HasAttrWithError(stuff, mypyc_interned_str.keys) > 0) {
        return PyDict_Update(dict, stuff);
    }
    return PyDict_MergeFromSeq2(dict, stuff, 1);
}
// dict(obj): build a fresh dict from a mapping or from an iterable of
// key/value pairs. Returns a new reference, or NULL with an error set.
PyObject *CPyDict_FromAny(PyObject *obj) {
    if (PyDict_Check(obj)) {
        return PyDict_Copy(obj);
    }
    PyObject *result = PyDict_New();
    if (result == NULL) {
        return NULL;
    }
    int status;
    if (PyObject_HasAttrWithError(obj, mypyc_interned_str.keys) > 0) {
        // Mapping-like: merge by keys.
        status = PyDict_Update(result, obj);
    } else {
        // Otherwise treat as a sequence of 2-item pairs.
        status = PyDict_MergeFromSeq2(result, obj, 1);
    }
    if (status < 0) {
        Py_DECREF(result);
        return NULL;
    }
    return result;
}
// dict.keys() view. Returns a new reference, or NULL with an error set.
PyObject *CPyDict_KeysView(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        return PyObject_CallMethodNoArgs(dict, mypyc_interned_str.keys);
    }
    return _CPyDictView_New(dict, &PyDictKeys_Type);
}
// dict.values() view. Returns a new reference, or NULL with an error set.
PyObject *CPyDict_ValuesView(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        return PyObject_CallMethodNoArgs(dict, mypyc_interned_str.values);
    }
    return _CPyDictView_New(dict, &PyDictValues_Type);
}
// dict.items() view. Returns a new reference, or NULL with an error set.
PyObject *CPyDict_ItemsView(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        return PyObject_CallMethodNoArgs(dict, mypyc_interned_str.items);
    }
    return _CPyDictView_New(dict, &PyDictItems_Type);
}
// list(dict.keys()). Returns a new list reference, or NULL with an error set.
PyObject *CPyDict_Keys(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        return PyDict_Keys(dict);
    }
    // Inline generic fallback logic to also return a list.
    PyObject *list = PyList_New(0);
    // Fix: allocation result was previously used unchecked.
    if (list == NULL) {
        return NULL;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, mypyc_interned_str.keys);
    if (view == NULL) {
        Py_DECREF(list);  // fix: list was previously leaked on this path
        return NULL;
    }
    int res = PyList_Extend(list, view);
    Py_DECREF(view);
    if (res < 0) {
        Py_DECREF(list);  // fix: list was previously leaked on this path
        return NULL;
    }
    return list;
}
// list(dict.values()). Returns a new list reference, or NULL with an error set.
PyObject *CPyDict_Values(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        return PyDict_Values(dict);
    }
    // Inline generic fallback logic to also return a list.
    PyObject *list = PyList_New(0);
    // Fix: allocation result was previously used unchecked.
    if (list == NULL) {
        return NULL;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, mypyc_interned_str.values);
    if (view == NULL) {
        Py_DECREF(list);  // fix: list was previously leaked on this path
        return NULL;
    }
    int res = PyList_Extend(list, view);
    Py_DECREF(view);
    if (res < 0) {
        Py_DECREF(list);  // fix: list was previously leaked on this path
        return NULL;
    }
    return list;
}
// list(dict.items()). Returns a new list reference, or NULL with an error set.
PyObject *CPyDict_Items(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        return PyDict_Items(dict);
    }
    // Inline generic fallback logic to also return a list.
    PyObject *list = PyList_New(0);
    // Fix: allocation result was previously used unchecked.
    if (list == NULL) {
        return NULL;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, mypyc_interned_str.items);
    if (view == NULL) {
        Py_DECREF(list);  // fix: list was previously leaked on this path
        return NULL;
    }
    int res = PyList_Extend(list, view);
    Py_DECREF(view);
    if (res < 0) {
        Py_DECREF(list);  // fix: list was previously leaked on this path
        return NULL;
    }
    return list;
}
// dict.clear(). Returns 1 on success, 0 with an error set on failure.
char CPyDict_Clear(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        PyDict_Clear(dict);
    } else {
        PyObject *res = PyObject_CallMethodNoArgs(dict, mypyc_interned_str.clear);
        if (res == NULL) {
            return 0;
        }
        // Fix: the clear() result (normally None) was previously leaked.
        Py_DECREF(res);
    }
    return 1;
}
// dict.copy(). Returns a new reference, or NULL with an error set.
PyObject *CPyDict_Copy(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        return PyObject_CallMethodNoArgs(dict, mypyc_interned_str.copy);
    }
    return PyDict_Copy(dict);
}
// Obtain an object for iterating over a dict's keys.
// For exact dicts, the dict itself is returned (signals the caller may use
// the PyDict_Next fast path); otherwise a generic iterator is returned.
PyObject *CPyDict_GetKeysIter(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        return PyObject_GetIter(dict);
    }
    // Return dict itself to indicate we can use fast path instead.
    Py_INCREF(dict);
    return dict;
}
// Obtain an object for iterating over a dict's items.
// For exact dicts, the dict itself is returned (fast-path signal);
// otherwise an iterator over the items() view is returned.
PyObject *CPyDict_GetItemsIter(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        // Return dict itself to indicate we can use fast path instead.
        Py_INCREF(dict);
        return dict;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, mypyc_interned_str.items);
    if (view == NULL) {
        return NULL;
    }
    PyObject *it = PyObject_GetIter(view);
    Py_DECREF(view);
    return it;
}
// Obtain an object for iterating over a dict's values.
// For exact dicts, the dict itself is returned (fast-path signal);
// otherwise an iterator over the values() view is returned.
PyObject *CPyDict_GetValuesIter(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        // Return dict itself to indicate we can use fast path instead.
        Py_INCREF(dict);
        return dict;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, mypyc_interned_str.values);
    if (view == NULL) {
        return NULL;
    }
    PyObject *it = PyObject_GetIter(view);
    Py_DECREF(view);
    return it;
}
// Pull the next element from a generic iterator into ret->f2 and set the
// "should continue" flag in ret->f0. On exhaustion (or error) f2 is set to
// None so the caller's refcount management stays uniform.
static void _CPyDict_FromNext(tuple_T3CIO *ret, PyObject *dict_iter) {
    // Get next item from iterator and set "should continue" flag.
    ret->f2 = PyIter_Next(dict_iter);
    if (ret->f2 == NULL) {
        ret->f0 = 0;
        Py_INCREF(Py_None);
        ret->f2 = Py_None;
    } else {
        ret->f0 = 1;
    }
}
// Helpers for fast dictionary iteration, return a single tuple
// instead of writing to multiple registers, for exact dicts use
// the fast path, and fall back to generic iterator logic for subclasses.
// Fast key-iteration step: returns (should_continue, new_offset, key).
// For exact dicts this advances via PyDict_Next(); otherwise
// 'dict_or_iter' is assumed to be an iterator produced by
// CPyDict_GetKeysIter() and the offset is passed through unchanged.
tuple_T3CIO CPyDict_NextKey(PyObject *dict_or_iter, CPyTagged offset) {
    tuple_T3CIO ret;
    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);
    PyObject *dummy;

    if (PyDict_CheckExact(dict_or_iter)) {
        ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &ret.f2, &dummy);
        if (ret.f0) {
            ret.f1 = CPyTagged_FromSsize_t(py_offset);
        } else {
            // Set key to None, so mypyc can manage refcounts.
            ret.f1 = 0;
            ret.f2 = Py_None;
        }
        // PyDict_Next() returns borrowed references.
        Py_INCREF(ret.f2);
    } else {
        // offset is dummy in this case, just use the old value.
        ret.f1 = offset;
        _CPyDict_FromNext(&ret, dict_or_iter);
    }
    return ret;
}
// Fast value-iteration step: returns (should_continue, new_offset, value).
// For exact dicts this advances via PyDict_Next(); otherwise
// 'dict_or_iter' is assumed to be an iterator produced by
// CPyDict_GetValuesIter() and the offset is passed through unchanged.
tuple_T3CIO CPyDict_NextValue(PyObject *dict_or_iter, CPyTagged offset) {
    tuple_T3CIO ret;
    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);
    PyObject *dummy;

    if (PyDict_CheckExact(dict_or_iter)) {
        ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &dummy, &ret.f2);
        if (ret.f0) {
            ret.f1 = CPyTagged_FromSsize_t(py_offset);
        } else {
            // Set value to None, so mypyc can manage refcounts.
            ret.f1 = 0;
            ret.f2 = Py_None;
        }
        // PyDict_Next() returns borrowed references.
        Py_INCREF(ret.f2);
    } else {
        // offset is dummy in this case, just use the old value.
        ret.f1 = offset;
        _CPyDict_FromNext(&ret, dict_or_iter);
    }
    return ret;
}
// Fast item-iteration step: returns (should_continue, new_offset, key, value).
// For exact dicts this advances via PyDict_Next(); otherwise it pulls
// (key, value) tuples from a generic iterator. Key/value are returned as
// new references (None on exhaustion or error).
tuple_T4CIOO CPyDict_NextItem(PyObject *dict_or_iter, CPyTagged offset) {
    tuple_T4CIOO ret;
    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);

    if (PyDict_CheckExact(dict_or_iter)) {
        ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &ret.f2, &ret.f3);
        if (ret.f0) {
            ret.f1 = CPyTagged_FromSsize_t(py_offset);
        } else {
            // Set key and value to None, so mypyc can manage refcounts.
            ret.f1 = 0;
            ret.f2 = Py_None;
            ret.f3 = Py_None;
        }
        // PyDict_Next() returns borrowed references.
        Py_INCREF(ret.f2);
        Py_INCREF(ret.f3);
    } else {
        // offset is dummy in this case, just use the old value.
        ret.f1 = offset;
        PyObject *item = PyIter_Next(dict_or_iter);
        if (item == NULL || !PyTuple_Check(item) || PyTuple_GET_SIZE(item) != 2) {
            if (item != NULL) {
                PyErr_SetString(PyExc_TypeError, "a tuple of length 2 expected");
                // Fix: the rejected item was previously leaked.
                Py_DECREF(item);
            }
            ret.f0 = 0;
            ret.f2 = Py_None;
            ret.f3 = Py_None;
            Py_INCREF(ret.f2);
            Py_INCREF(ret.f3);
        } else {
            ret.f0 = 1;
            ret.f2 = PyTuple_GET_ITEM(item, 0);
            ret.f3 = PyTuple_GET_ITEM(item, 1);
            // Fix: take our own references *before* releasing the tuple.
            // Previously the tuple was decref'd first, so f2/f3 could be
            // dangling borrowed pointers if the tuple held the only
            // references (use-after-free on the later INCREFs).
            Py_INCREF(ret.f2);
            Py_INCREF(ret.f3);
            Py_DECREF(item);
        }
    }
    return ret;
}
// Check the Py_TPFLAGS_MAPPING type flag (non-zero iff the type registers
// as a mapping).
int CPyMapping_Check(PyObject *obj) {
    unsigned long flags = Py_TYPE(obj)->tp_flags;
    return flags & Py_TPFLAGS_MAPPING;
}

View file

@ -0,0 +1,261 @@
#include "pythoncapi_compat.h"
// Exception related primitive operations
//
// These are registered in mypyc.primitives.exc_ops.
#include <Python.h>
#include "CPy.h"
// Implement `raise exc`: if 'exc' is a class, instantiate it with no
// arguments and set that instance; otherwise treat 'exc' as an exception
// instance and set it with its own type. Leaves the error indicator set.
void CPy_Raise(PyObject *exc) {
    if (PyObject_IsInstance(exc, (PyObject *)&PyType_Type)) {
        PyObject *obj = PyObject_CallNoArgs(exc);
        // Constructing the instance may itself fail; its error propagates.
        if (!obj)
            return;
        PyErr_SetObject(exc, obj);
        Py_DECREF(obj);
    } else {
        PyErr_SetObject((PyObject *)Py_TYPE(exc), exc);
    }
}
// Bare `raise`: re-raise the exception currently being handled.
// PyErr_GetExcInfo returns new references and PyErr_Restore steals them,
// so refcounts balance.
void CPy_Reraise(void) {
    PyObject *exc_type, *exc_value, *exc_tb;
    PyErr_GetExcInfo(&exc_type, &exc_value, &exc_tb);
    PyErr_Restore(exc_type, exc_value, exc_tb);
}
// Set the error indicator from an explicit (type, value, traceback) triple,
// as used by three-argument raise. When 'type' is actually an exception
// instance (and value is None), use the instance and its type instead.
void CPyErr_SetObjectAndTraceback(PyObject *type, PyObject *value, PyObject *traceback) {
    if (!PyType_Check(type) && Py_IsNone(value)) {
        // The first argument must be an exception instance
        value = type;
        type = (PyObject *)Py_TYPE(value);
    }

    // Set the value and traceback of an error. Because calling
    // PyErr_Restore takes away a reference to each object passed in
    // as an argument, we manually increase the reference count of
    // each argument before calling it.
    Py_INCREF(type);
    Py_INCREF(value);
    Py_INCREF(traceback);

    PyErr_Restore(type, value, traceback);
}
// Enter an except block: stash the currently raised exception into
// sys.exc_info() (normalized), clear the error indicator, and return the
// *previous* exc_info triple so it can be restored when handling finishes.
// NULL slots are replaced by the ExcDummy sentinel for uniform refcounting.
tuple_T3OOO CPy_CatchError(void) {
    // We need to return the existing sys.exc_info() information, so
    // that it can be restored when we finish handling the error we
    // are catching now. Grab that triple and convert NULL values to
    // the ExcDummy object in order to simplify refcount handling in
    // generated code.
    tuple_T3OOO ret;
    PyErr_GetExcInfo(&ret.f0, &ret.f1, &ret.f2);
    _CPy_ToDummy(&ret.f0);
    _CPy_ToDummy(&ret.f1);
    _CPy_ToDummy(&ret.f2);

    if (!PyErr_Occurred()) {
        PyErr_SetString(PyExc_RuntimeError, "CPy_CatchError called with no error!");
    }

    // Retrieve the error info and normalize it so that it looks like
    // what python code needs it to be.
    PyObject *type, *value, *traceback;
    PyErr_Fetch(&type, &value, &traceback);
    // Could we avoid always normalizing?
    PyErr_NormalizeException(&type, &value, &traceback);
    if (traceback != NULL) {
        PyException_SetTraceback(value, traceback);
    }
    // Indicate that we are now handling this exception by stashing it
    // in sys.exc_info(). mypyc routines that need access to the
    // exception will read it out of there.
    PyErr_SetExcInfo(type, value, traceback);
    // Clear the error indicator, since the exception isn't
    // propagating anymore.
    PyErr_Clear();

    return ret;
}
// Restore a triple previously returned by CPy_CatchError into
// sys.exc_info() (dummy placeholders are converted back to NULL).
void CPy_RestoreExcInfo(tuple_T3OOO info) {
    PyErr_SetExcInfo(_CPy_FromDummy(info.f0), _CPy_FromDummy(info.f1), _CPy_FromDummy(info.f2));
}
// True if the currently handled exception value is an instance of `type`
// (or a subclass), like an `except type:` clause.
bool CPy_ExceptionMatches(PyObject *type) {
    return PyErr_GivenExceptionMatches((PyObject *)Py_TYPE(CPy_ExcState()->exc_value), type);
}
// Return a new reference to the currently handled exception value.
PyObject *CPy_GetExcValue(void) {
    PyObject *exc = CPy_ExcState()->exc_value;
    Py_INCREF(exc);
    return exc;
}
// Replace a NULL slot with a new reference to None, so downstream
// refcount handling never has to special-case NULL.
static inline void _CPy_ToNone(PyObject **p) {
    if (*p != NULL) {
        return;
    }
    Py_INCREF(Py_None);
    *p = Py_None;
}
// Like PyErr_GetExcInfo, but NULL slots are replaced with new references
// to None so callers always receive valid objects.
void _CPy_GetExcInfo(PyObject **p_type, PyObject **p_value, PyObject **p_traceback) {
    PyErr_GetExcInfo(p_type, p_value, p_traceback);
    _CPy_ToNone(p_type);
    _CPy_ToNone(p_value);
    _CPy_ToNone(p_traceback);
}
// Tuple-returning wrapper around _CPy_GetExcInfo for generated code.
tuple_T3OOO CPy_GetExcInfo(void) {
    tuple_T3OOO ret;
    _CPy_GetExcInfo(&ret.f0, &ret.f1, &ret.f2);
    return ret;
}
// Report an unrecoverable allocation failure on stderr and terminate
// the process. Never returns.
void CPyError_OutOfMemory(void) {
    fputs("fatal: out of memory\n", stderr);
    fflush(stderr);
    abort();
}
// Construct a nicely formatted type name based on __module__ and __name__.
// Construct a nicely formatted type name based on __module__ and __name__.
// Returns a new reference, or NULL (without necessarily setting an error)
// if either attribute is missing or not a string. Names in the builtins
// module are returned unqualified.
static PyObject *CPy_GetTypeName(PyObject *type) {
    PyObject *module = NULL, *name = NULL;
    PyObject *full = NULL;
    module = PyObject_GetAttr(type, mypyc_interned_str.__module__);
    if (!module || !PyUnicode_Check(module)) {
        goto out;
    }
    name = PyObject_GetAttr(type, mypyc_interned_str.__qualname__);
    if (!name || !PyUnicode_Check(name)) {
        goto out;
    }
    if (PyUnicode_CompareWithASCIIString(module, "builtins") == 0) {
        // Plain "int", not "builtins.int".
        Py_INCREF(name);
        full = name;
    } else {
        full = PyUnicode_FromFormat("%U.%U", module, name);
    }
out:
    // goto-based cleanup: both attributes are released on every path.
    Py_XDECREF(module);
    Py_XDECREF(name);
    return full;
}
// Get the type of a value as a string, expanding tuples to include
// all the element types.
// Get the type of a value as a string, expanding tuples to include
// all the element types (e.g. "tuple[int, str]"). Returns a new
// reference, or NULL on failure.
static PyObject *CPy_FormatTypeName(PyObject *value) {
    if (Py_IsNone(value)) {
        return PyUnicode_FromString("None");
    }
    if (!PyTuple_CheckExact(value)) {
        return CPy_GetTypeName((PyObject *)Py_TYPE(value));
    }
    if (PyTuple_GET_SIZE(value) > 10) {
        // PyTuple_GET_SIZE yields a Py_ssize_t; use %zd (not %d) so the
        // varargs read matches the argument width on LP64 platforms.
        return PyUnicode_FromFormat("tuple[<%zd items>]", PyTuple_GET_SIZE(value));
    }
    // Most of the logic is all for tuples, which is the only interesting case
    PyObject *output = PyUnicode_FromString("tuple[");
    if (!output) {
        return NULL;
    }
    /* This is quadratic but if that ever matters something is really weird. */
    Py_ssize_t i;
    for (i = 0; i < PyTuple_GET_SIZE(value); i++) {
        PyObject *s = CPy_FormatTypeName(PyTuple_GET_ITEM(value, i));
        if (!s) {
            Py_DECREF(output);
            return NULL;
        }
        // Append this element's name plus either a separator or the closer.
        PyObject *next = PyUnicode_FromFormat("%U%U%s", output, s,
                                              i + 1 == PyTuple_GET_SIZE(value) ? "]" : ", ");
        Py_DECREF(output);
        Py_DECREF(s);
        if (!next) {
            return NULL;
        }
        output = next;
    }
    return output;
}
// Set a TypeError of the form "<expected> object expected; got <actual>".
// Falls back to a generic message if formatting the actual type fails.
CPy_NOINLINE
void CPy_TypeError(const char *expected, PyObject *value) {
    PyObject *out = CPy_FormatTypeName(value);
    if (out) {
        PyErr_Format(PyExc_TypeError, "%s object expected; got %U", expected, out);
        Py_DECREF(out);
    } else {
        PyErr_Format(PyExc_TypeError, "%s object expected; and errored formatting real type!",
                     expected);
    }
}
// The PyFrameObject type definition (struct _frame) has been moved
// to the internal C API: to the pycore_frame.h header file.
// https://github.com/python/cpython/pull/31530
#if PY_VERSION_HEX >= 0x030b00a6
#include "internal/pycore_frame.h"
#endif
// This function is basically exactly the same with _PyTraceback_Add
// which is available in all the versions we support.
// We're continuing to use this because we'll probably optimize this later.
// Append a synthetic traceback entry (filename/funcname/line) to the
// pending exception, mirroring CPython's _PyTraceback_Add.
void CPy_AddTraceback(const char *filename, const char *funcname, int line, PyObject *globals) {
    PyObject *exc, *val, *tb;
    PyThreadState *thread_state = PyThreadState_GET();
    PyFrameObject *frame_obj;
    // We need to save off the exception state because in 3.8,
    // PyFrame_New fails if there is an error set and it fails to look
    // up builtins in the globals. (_PyTraceback_Add documents that it
    // needs to do it because it decodes the filename according to the
    // FS encoding, which could have a decoder in Python. We don't do
    // that so *that* doesn't apply to us.)
    PyErr_Fetch(&exc, &val, &tb);
    PyCodeObject *code_obj = PyCode_NewEmpty(filename, funcname, line);
    if (code_obj == NULL) {
        goto error;
    }
    frame_obj = PyFrame_New(thread_state, code_obj, globals, 0);
    if (frame_obj == NULL) {
        Py_DECREF(code_obj);
        goto error;
    }
    // NOTE(review): direct f_lineno assignment depends on the frame struct
    // layout (pycore_frame.h on 3.11+); confirm against each supported
    // CPython version.
    frame_obj->f_lineno = line;
    PyErr_Restore(exc, val, tb);
    PyTraceBack_Here(frame_obj);
    Py_DECREF(code_obj);
    Py_DECREF(frame_obj);
    return;
error:
#if CPY_3_12_FEATURES
    // NOTE(review): _PyErr_ChainExceptions1 expects the exception
    // *instance*; `exc` here is the type slot from PyErr_Fetch. Verify the
    // intended argument on 3.12+ (and that val/tb are not leaked here).
    _PyErr_ChainExceptions1(exc);
#else
    _PyErr_ChainExceptions(exc, val, tb);
#endif
}
// Convenience: set a TypeError about `value` and attach a synthetic
// traceback frame for the given source location.
CPy_NOINLINE
void CPy_TypeErrorTraceback(const char *filename, const char *funcname, int line,
                            PyObject *globals, const char *expected, PyObject *value) {
    CPy_TypeError(expected, value);
    CPy_AddTraceback(filename, funcname, line, globals);
}
// Set an AttributeError "attribute '<attr>' of '<class>' undefined" and
// attach a synthetic traceback entry for the given source location.
// Both names are capped at 200 characters, as before.
void CPy_AttributeError(const char *filename, const char *funcname, const char *classname,
                        const char *attrname, int line, PyObject *globals) {
    PyErr_Format(PyExc_AttributeError, "attribute '%.200s' of '%.200s' undefined",
                 attrname, classname);
    CPy_AddTraceback(filename, funcname, line, globals);
}

View file

@ -0,0 +1,239 @@
// Float primitive operations
//
// These are registered in mypyc.primitives.float_ops.
#include <Python.h>
#include "CPy.h"
// Set ValueError("math domain error") and return the float error sentinel.
static double CPy_DomainError(void) {
    PyErr_SetString(PyExc_ValueError, "math domain error");
    return CPY_FLOAT_ERROR;
}
// Set OverflowError("math range error") and return the float error sentinel.
static double CPy_MathRangeError(void) {
    PyErr_SetString(PyExc_OverflowError, "math range error");
    return CPY_FLOAT_ERROR;
}
// Shared helper: set ValueError("expected a <kind> input, got <x>") with
// repr-style formatting of x, and return the float error sentinel. If
// formatting fails, the allocation error is left set instead.
static double CPy_MathExpectedInputError(double x, const char *kind) {
    char *buf = PyOS_double_to_string(x, 'r', 0, Py_DTSF_ADD_DOT_0, NULL);
    if (buf) {
        PyErr_Format(PyExc_ValueError, "expected a %s input, got %s", kind, buf);
        PyMem_Free(buf);
    }
    return CPY_FLOAT_ERROR;
}
// The three wrappers below keep their original names/signatures; the
// messages produced are byte-identical to the previous copies.
static double CPy_MathExpectedNonNegativeInputError(double x) {
    return CPy_MathExpectedInputError(x, "nonnegative");
}
static double CPy_MathExpectedPositiveInputError(double x) {
    return CPy_MathExpectedInputError(x, "positive");
}
static double CPy_MathExpectedFiniteInput(double x) {
    return CPy_MathExpectedInputError(x, "finite");
}
// Convert a tagged integer to a double. Short (unboxed) ints convert
// directly; boxed ints go through PyFloat_AsDouble, which may fail
// (e.g. OverflowError), in which case the error sentinel is returned
// with an exception set.
double CPyFloat_FromTagged(CPyTagged x) {
    if (CPyTagged_CheckShort(x)) {
        return CPyTagged_ShortAsSsize_t(x);
    }
    double result = PyFloat_AsDouble(CPyTagged_LongAsObject(x));
    // -1.0 is PyFloat_AsDouble's error value; disambiguate via PyErr_Occurred.
    if (unlikely(result == -1.0) && PyErr_Occurred()) {
        return CPY_FLOAT_ERROR;
    }
    return result;
}
// sin(x) with Python math-module error semantics: a NaN result from a
// non-NaN input means x was infinite, which is a domain error.
double CPyFloat_Sin(double x) {
    double v = sin(x);
    if (unlikely(isnan(v)) && !isnan(x)) {
#if CPY_3_14_FEATURES
        // 3.14 uses the more specific "expected a finite input" message.
        return CPy_MathExpectedFiniteInput(x);
#else
        return CPy_DomainError();
#endif
    }
    return v;
}
// cos(x); same error handling as CPyFloat_Sin.
double CPyFloat_Cos(double x) {
    double v = cos(x);
    if (unlikely(isnan(v)) && !isnan(x)) {
#if CPY_3_14_FEATURES
        return CPy_MathExpectedFiniteInput(x);
#else
        return CPy_DomainError();
#endif
    }
    return v;
}
// tan(x); infinity is rejected up front (tan(inf) is undefined).
double CPyFloat_Tan(double x) {
    if (unlikely(isinf(x))) {
#if CPY_3_14_FEATURES
        return CPy_MathExpectedFiniteInput(x);
#else
        return CPy_DomainError();
#endif
    }
    return tan(x);
}
// sqrt(x); negative input is a domain error (per math.sqrt).
double CPyFloat_Sqrt(double x) {
    if (x < 0.0) {
#if CPY_3_14_FEATURES
        return CPy_MathExpectedNonNegativeInputError(x);
#else
        return CPy_DomainError();
#endif
    }
    return sqrt(x);
}
// exp(x); overflow to infinity from a finite input is a range error.
double CPyFloat_Exp(double x) {
    double v = exp(x);
    if (unlikely(v == INFINITY) && x != INFINITY) {
        return CPy_MathRangeError();
    }
    return v;
}
// log(x); non-positive input is a domain error (per math.log).
double CPyFloat_Log(double x) {
    if (x <= 0.0) {
#if CPY_3_14_FEATURES
        return CPy_MathExpectedPositiveInputError(x);
#else
        return CPy_DomainError();
#endif
    }
    return log(x);
}
// math.floor(x) returning a tagged int (may box; conversion can fail
// for non-finite x, handled inside CPyTagged_FromFloat).
CPyTagged CPyFloat_Floor(double x) {
    double v = floor(x);
    return CPyTagged_FromFloat(v);
}
// math.ceil(x) returning a tagged int; see CPyFloat_Floor.
CPyTagged CPyFloat_Ceil(double x) {
    double v = ceil(x);
    return CPyTagged_FromFloat(v);
}
// True iff x is positive or negative infinity (math.isinf).
bool CPyFloat_IsInf(double x) {
    // isinf returns an implementation-defined nonzero value; normalize.
    return isinf(x) ? true : false;
}
// True iff x is a NaN (math.isnan).
bool CPyFloat_IsNaN(double x) {
    // isnan returns an implementation-defined nonzero value; normalize.
    return isnan(x) ? true : false;
}
// From CPython 3.10.0, Objects/floatobject.c
static void
_float_div_mod(double vx, double wx, double *floordiv, double *mod)
{
double div;
*mod = fmod(vx, wx);
/* fmod is typically exact, so vx-mod is *mathematically* an
exact multiple of wx. But this is fp arithmetic, and fp
vx - mod is an approximation; the result is that div may
not be an exact integral value after the division, although
it will always be very close to one.
*/
div = (vx - *mod) / wx;
if (*mod) {
/* ensure the remainder has the same sign as the denominator */
if ((wx < 0) != (*mod < 0)) {
*mod += wx;
div -= 1.0;
}
}
else {
/* the remainder is zero, and in the presence of signed zeroes
fmod returns different results across platforms; ensure
it has the same sign as the denominator. */
*mod = copysign(0.0, wx);
}
/* snap quotient to nearest integral value */
if (div) {
*floordiv = floor(div);
if (div - *floordiv > 0.5) {
*floordiv += 1.0;
}
}
else {
/* div is zero - get the same sign as the true quotient */
*floordiv = copysign(0.0, vx / wx); /* zero w/ sign of vx/wx */
}
}
// x // y with Python semantics; division by zero raises
// ZeroDivisionError and returns the error sentinel.
double CPyFloat_FloorDivide(double x, double y) {
    double mod, floordiv;
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "float floor division by zero");
        return CPY_FLOAT_ERROR;
    }
    _float_div_mod(x, y, &floordiv, &mod);
    return floordiv;
}
// Adapted from CPython 3.10.7
// x ** y with Python float semantics: special cases for NaN/inf follow
// IEEE/C99 pow conventions; domain errors (e.g. 0**-inf pre-3.11, or a
// NaN result from finite inputs) raise ValueError; overflow of finite
// inputs raises OverflowError.
double CPyFloat_Pow(double x, double y) {
    if (!isfinite(x) || !isfinite(y)) {
        if (isnan(x))
            return y == 0.0 ? 1.0 : x; /* NaN**0 = 1 */
        else if (isnan(y))
            return x == 1.0 ? 1.0 : y; /* 1**NaN = 1 */
        else if (isinf(x)) {
            // Sign of the result depends on whether y is a finite odd integer.
            int odd_y = isfinite(y) && fmod(fabs(y), 2.0) == 1.0;
            if (y > 0.0)
                return odd_y ? x : fabs(x);
            else if (y == 0.0)
                return 1.0;
            else /* y < 0. */
                return odd_y ? copysign(0.0, x) : 0.0;
        }
        else if (isinf(y)) {
            if (fabs(x) == 1.0)
                return 1.0;
            else if (y > 0.0 && fabs(x) > 1.0)
                return y;
            else if (y < 0.0 && fabs(x) < 1.0) {
#if PY_VERSION_HEX < 0x030B0000
                if (x == 0.0) { /* 0**-inf: divide-by-zero */
                    return CPy_DomainError();
                }
#endif
                return -y; /* result is +inf */
            } else
                return 0.0;
        }
    }
    double r = pow(x, y);
    if (!isfinite(r)) {
        if (isnan(r)) {
            // pow of finite inputs produced NaN: negative base with
            // non-integral exponent.
            return CPy_DomainError();
        }
        /*
           an infinite result here arises either from:
           (A) (+/-0.)**negative (-> divide-by-zero)
           (B) overflow of x**y with x and y finite
        */
        else if (isinf(r)) {
            if (x == 0.0)
                return CPy_DomainError();
            else
                return CPy_MathRangeError();
        }
    }
    return r;
}

View file

@ -0,0 +1,271 @@
#define PY_SSIZE_T_CLEAN
#include <stdint.h>
#include "CPy.h"
// Accessors for fields living in the embedded PyCFunctionObject /
// PyCMethodObject headers of a CPyFunction.
#define CPyFunction_weakreflist(f) (((PyCFunctionObject *)f)->m_weakreflist)
// Defining class of a compiled method (may be NULL for plain functions).
#define CPyFunction_class(f) ((PyObject*) ((PyCMethodObject *) (f))->mm_class)
#define CPyFunction_func_vectorcall(f) (((PyCFunctionObject *)f)->vectorcall)
// tp_clear: drop all owned references (module, managed __dict__, name,
// code object, and the method's defining class).
static int
CPyFunction_clear(CPyFunction *m) {
    Py_CLEAR(((PyCFunctionObject*)m)->m_module);
    PyObject_ClearManagedDict((PyObject*)m);
    Py_CLEAR(m->func_name);
    Py_CLEAR(m->func_code);
    // mm_class is cleared by hand: NULL the slot first, then release,
    // mirroring what Py_CLEAR does for a plain field.
    PyObject *cls = CPyFunction_class(m);
    ((PyCMethodObject *)m)->mm_class = NULL;
    Py_XDECREF(cls);
    return 0;
}
// tp_dealloc: untrack from the GC, clear weak references and owned
// references, free the heap-allocated PyMethodDef, release the object.
static void CPyFunction_dealloc(CPyFunction *m) {
    PyObject_GC_UnTrack(m);
    if (CPyFunction_weakreflist(m) != NULL)
        PyObject_ClearWeakRefs((PyObject *) m);
    CPyFunction_clear(m);
    // m_ml was allocated with PyMem_Malloc in CPyMethodDef_New.
    PyMem_Free(m->func.func.m_ml);
    PyObject_GC_Del(m);
}
// tp_repr: "<function NAME at 0xADDR>".
static PyObject* CPyFunction_repr(CPyFunction *op) {
    return PyUnicode_FromFormat("<function %U at %p>", op->func_name, (void *)op);
}
// tp_call: forward to the vectorcall protocol. The vectorcall slot is
// always populated by CPyFunction_Init; validate that invariant in debug
// builds. The lookup lives inside the assert so NDEBUG builds don't get
// an unused-variable warning.
static PyObject* CPyFunction_call(PyObject *func, PyObject *args, PyObject *kw) {
    assert(CPyFunction_func_vectorcall((CPyFunction *)func) != NULL);
    return PyVectorcall_Call(func, args, kw);
}
// tp_traverse: visit every owned reference for the cyclic GC
// (mirrors the fields released in CPyFunction_clear).
static int CPyFunction_traverse(CPyFunction *m, visitproc visit, void *arg) {
    Py_VISIT(((PyCFunctionObject *)m)->m_module);
    int e = PyObject_VisitManagedDict((PyObject*)m, visit, arg);
    if (e != 0) return e;
    Py_VISIT(m->func_name);
    Py_VISIT(m->func_code);
    Py_VISIT(CPyFunction_class(m));
    return 0;
}
// Member table: exposes __module__ read/write plus the offsets CPython
// needs to locate the vectorcall slot and the weakref list.
static PyMemberDef CPyFunction_members[] = {
    {"__module__", T_OBJECT, offsetof(PyCFunctionObject, m_module), 0, 0},
    {"__vectorcalloffset__", T_PYSSIZET, offsetof(PyCFunctionObject, vectorcall), READONLY, 0},
    {"__weaklistoffset__", T_PYSSIZET, offsetof(PyCFunctionObject, m_weakreflist), READONLY, 0},
    {0, 0, 0, 0, 0}
};
// __name__ getter: lazily intern the name from the PyMethodDef on first
// access; returns a new reference.
PyObject* CPyFunction_get_name(PyObject *op, void *context) {
    (void)context;
    CPyFunction *func = (CPyFunction *)op;
    if (unlikely(func->func_name == NULL)) {
        func->func_name = PyUnicode_InternFromString(((PyCFunctionObject *)func)->m_ml->ml_name);
        if (unlikely(func->func_name == NULL))
            return NULL;
    }
    Py_INCREF(func->func_name);
    return func->func_name;
}
// __name__ setter: only string values are accepted.
int CPyFunction_set_name(PyObject *op, PyObject *value, void *context) {
    (void)context;
    CPyFunction *func = (CPyFunction *)op;
    if (unlikely(!value || !PyUnicode_Check(value))) {
        PyErr_SetString(PyExc_TypeError, "__name__ must be set to a string object");
        return -1;
    }
    // Incref the new value before dropping the old one, in case they alias.
    Py_INCREF(value);
    Py_XDECREF(func->func_name);
    func->func_name = value;
    return 0;
}
// __code__ getter: the synthesized code object, or None if absent.
PyObject* CPyFunction_get_code(PyObject *op, void *context) {
    (void)context;
    CPyFunction *func = (CPyFunction *)op;
    PyObject* result = (func->func_code) ? func->func_code : Py_None;
    Py_INCREF(result);
    return result;
}
// Getter stub for attributes compiled functions always report as None.
static PyObject* CPyFunction_get_none(PyObject *op, void *context) {
    (void)op;
    (void)context;
    Py_RETURN_NONE;
}
// Setter stub: silently accept and discard any assignment.
int CPyFunction_set_none(PyObject *op, PyObject *value, void *context) {
    (void)op;
    (void)value;
    (void)context;
    return 0;
}
// __defaults__, __kwdefaults__ and __annotations__ are not materialized
// for compiled functions; they all report None (and __annotations__
// silently ignores assignment).
PyObject* CPyFunction_get_defaults(PyObject *op, void *context) {
    return CPyFunction_get_none(op, context);
}
PyObject* CPyFunction_get_kwdefaults(PyObject *op, void *context) {
    return CPyFunction_get_none(op, context);
}
PyObject* CPyFunction_get_annotations(PyObject *op, void *context) {
    return CPyFunction_get_none(op, context);
}
int CPyFunction_set_annotations(PyObject *op, PyObject *value, void *context) {
    return CPyFunction_set_none(op, value, context);
}
// Getset table wiring the accessors above into the type.
static PyGetSetDef CPyFunction_getsets[] = {
    {"__dict__", (getter)PyObject_GenericGetDict, (setter)PyObject_GenericSetDict, 0, 0},
    {"__name__", (getter)CPyFunction_get_name, (setter)CPyFunction_set_name, 0, 0},
    {"__code__", (getter)CPyFunction_get_code, 0, 0, 0},
    {"__defaults__", (getter)CPyFunction_get_defaults, 0, 0, 0},
    {"__kwdefaults__", (getter)CPyFunction_get_kwdefaults, 0, 0, 0},
    {"__annotations__", (getter)CPyFunction_get_annotations, CPyFunction_set_annotations, 0, 0},
    {0, 0, 0, 0, 0}
};
// tp_descr_get: bind the function as a method when accessed through an
// instance; plain attribute access on the class returns the function.
static PyObject* CPy_PyMethod_New(PyObject *func, PyObject *self, PyObject *typ) {
    (void)typ;
    if (!self) {
        Py_INCREF(func);
        return func;
    }
    return PyMethod_New(func, self);
}
// Slot table wiring the callbacks above into the heap type.
static PyType_Slot CPyFunction_slots[] = {
    {Py_tp_dealloc, (void *)CPyFunction_dealloc},
    {Py_tp_repr, (void *)CPyFunction_repr},
    {Py_tp_call, (void *)CPyFunction_call},
    {Py_tp_traverse, (void *)CPyFunction_traverse},
    {Py_tp_clear, (void *)CPyFunction_clear},
    {Py_tp_members, (void *)CPyFunction_members},
    {Py_tp_getset, (void *)CPyFunction_getsets},
    {Py_tp_descr_get, (void *)CPy_PyMethod_New},
    {0, 0},
};
// Spec for PyType_FromSpec; the type is created lazily in CPyFunction_New.
static PyType_Spec CPyFunction_spec = {
    .name = "Function compiled with mypyc",
    .basicsize = sizeof(CPyFunction),
    .itemsize = 0,
    .flags = Py_TPFLAGS_IMMUTABLETYPE |
#if PY_VERSION_HEX >= 0x030C0000
    // Managed __dict__ storage is only available from 3.12 on.
    Py_TPFLAGS_MANAGED_DICT |
#endif
    Py_TPFLAGS_HAVE_VECTORCALL | Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_BASETYPE,
    .slots = CPyFunction_slots,
};
// Lazily-initialized singleton type object (set on first CPyFunction_New).
static PyTypeObject *CPyFunctionType = NULL;
// Vectorcall entry point: dispatch to the wrapped C function using the
// fastcall-with-keywords calling convention. If m_self is unset, the
// first positional argument is consumed as self.
static PyObject* CPyFunction_Vectorcall(PyObject *func, PyObject *const *args, size_t nargsf, PyObject *kwnames) {
    CPyFunction *f = (CPyFunction *)func;
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
    PyObject *self;
    PyCFunction meth = ((PyCFunctionObject *)f)->m_ml->ml_meth;
    self = ((PyCFunctionObject *)f)->m_self;
    if (!self) {
        // NOTE(review): assumes at least one positional argument is present
        // when m_self is NULL (the has_self_arg case) -- confirm callers
        // guarantee this, otherwise args[0] reads out of bounds.
        self = args[0];
        args += 1;
        nargs -= 1;
    }
    // Double cast through void(*)(void) avoids a strict function-pointer
    // type mismatch warning.
    return ((_PyCFunctionFastWithKeywords)(void(*)(void))meth)(self, args, nargs, kwnames);
}
// Initialize a freshly allocated CPyFunction in place. Takes its own
// references to name, module and code; `ml` ownership transfers to the
// object (freed in dealloc).
static CPyFunction* CPyFunction_Init(CPyFunction *op, PyMethodDef *ml, PyObject* name,
                                     PyObject *module, PyObject* code, bool set_self) {
    // NOTE(review): `op` and `name` are dereferenced without NULL checks;
    // callers must validate both before calling.
    PyCFunctionObject *cf = (PyCFunctionObject *)op;
    CPyFunction_weakreflist(op) = NULL;
    cf->m_ml = ml;
    // Point m_self at the function itself so the vectorcall handler can
    // detect the "no self argument" case (see CPyFunction_New).
    cf->m_self = set_self ? (PyObject *) op : NULL;
    Py_XINCREF(module);
    cf->m_module = module;
    Py_INCREF(name);
    op->func_name = name;
    ((PyCMethodObject *)op)->mm_class = NULL;
    Py_XINCREF(code);
    op->func_code = code;
    CPyFunction_func_vectorcall(op) = CPyFunction_Vectorcall;
    return op;
}
// Create an empty code object carrying only location info, then stamp the
// requested CO_* flags onto it.
static PyObject* CPyCode_New(const char *filename, const char *funcname, int first_line, int flags) {
    PyCodeObject *code_obj = PyCode_NewEmpty(filename, funcname, first_line);
    if (unlikely(!code_obj)) {
        return NULL;
    }
    // NOTE(review): writing co_flags after creation mutates a nominally
    // immutable object; confirm this is safe on all targeted versions.
    code_obj->co_flags = flags;
    return (PyObject *)code_obj;
}
// Heap-allocate a PyMethodDef (released via PyMem_Free in dealloc).
// Returns NULL on allocation failure; no Python error is set here.
static PyMethodDef* CPyMethodDef_New(const char *name, PyCFunction func, int flags, const char *doc) {
    PyMethodDef *def = (PyMethodDef *)PyMem_Malloc(sizeof(PyMethodDef));
    if (unlikely(def == NULL)) {
        return NULL;
    }
    *def = (PyMethodDef){
        .ml_name = name,
        .ml_meth = func,
        .ml_flags = flags,
        .ml_doc = doc,
    };
    return def;
}
// Create a new compiled-function object wrapping `func`.
//
//   module         owning module (may be NULL)
//   filename/funcname/first_line  source location for __code__/tracebacks
//   func           C implementation (fastcall-with-keywords convention)
//   func_flags     METH_* flags for the PyMethodDef
//   func_doc       docstring (may be NULL)
//   code_flags     CO_* flags stored on the synthesized code object
//   has_self_arg   true if `func` expects self in the args vector
//
// On any failure this cleans up and then reports out-of-memory, which
// aborts the process (setup failures are treated as fatal).
PyObject* CPyFunction_New(PyObject *module, const char *filename, const char *funcname,
                          PyCFunction func, int func_flags, const char *func_doc,
                          int first_line, int code_flags, bool has_self_arg) {
    PyMethodDef *method = NULL;
    PyObject *code = NULL, *name = NULL, *op = NULL;
    CPyFunction *raw = NULL;
    // Set m_self inside the function wrapper only if the wrapped function
    // has no self arg, to pass m_self as the self arg when the function is
    // called. When the function has a self arg, it will come in the args
    // vector passed to the vectorcall handler.
    bool set_self = !has_self_arg;
    if (!CPyFunctionType) {
        // Lazily create the shared type object on first use.
        CPyFunctionType = (PyTypeObject *)PyType_FromSpec(&CPyFunction_spec);
        if (unlikely(!CPyFunctionType)) {
            goto err;
        }
    }
    method = CPyMethodDef_New(funcname, func, func_flags, func_doc);
    if (unlikely(!method)) {
        goto err;
    }
    code = CPyCode_New(filename, funcname, first_line, code_flags);
    if (unlikely(!code)) {
        goto err;
    }
    // Previously this result was passed unchecked into CPyFunction_Init,
    // which would have crashed on Py_INCREF(NULL).
    name = PyUnicode_FromString(funcname);
    if (unlikely(!name)) {
        goto err;
    }
    // Likewise, a NULL allocation must be caught *before* Init touches it.
    raw = PyObject_GC_New(CPyFunction, CPyFunctionType);
    if (unlikely(!raw)) {
        goto err;
    }
    op = (PyObject *)CPyFunction_Init(raw, method, name, module, code, set_self);
    // Init takes its own references to name and code; release the creation
    // references so they are not leaked.
    Py_DECREF(name);
    Py_DECREF(code);
    PyObject_GC_Track(op);
    return op;
err:
    // Clean up *before* CPyError_OutOfMemory: it aborts and never returns,
    // so anything placed after it is dead code.
    Py_XDECREF(name);
    Py_XDECREF(code);
    if (method) {
        PyMem_Free(method);
    }
    CPyError_OutOfMemory();
    return NULL;
}

View file

@ -0,0 +1,84 @@
// Generic primitive operations
//
// These are registered in mypyc.primitives.generic_ops.
#include <Python.h>
#include "CPy.h"
// Hash an object, converting the Py_hash_t to a tagged int.
// Returns CPY_INT_TAG on error (PyObject_Hash uses -1 as its error
// sentinel and sets an exception).
CPyTagged CPyObject_Hash(PyObject *o) {
    Py_hash_t h = PyObject_Hash(o);
    if (h == -1) {
        return CPY_INT_TAG;
    } else {
        // This is tragically annoying. The range of hash values in
        // 64-bit python covers 64-bits, and our short integers only
        // cover 63. This means that half the time we are boxing the
        // result for basically no good reason. To add insult to
        // injury it is probably about to be immediately unboxed by a
        // tp_hash wrapper.
        return CPyTagged_FromSsize_t(h);
    }
}
// getattr(v, name, defl): return the attribute, or a new reference to
// `defl` if lookup raises AttributeError. Other exceptions propagate
// (returns NULL with the error still set).
PyObject *CPyObject_GetAttr3(PyObject *v, PyObject *name, PyObject *defl)
{
    PyObject *attr = PyObject_GetAttr(v, name);
    if (attr != NULL) {
        return attr;
    }
    if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
        return NULL;
    }
    PyErr_Clear();
    Py_INCREF(defl);
    return defl;
}
// Advance an iterator by calling tp_iternext directly. May return NULL
// with no exception set to indicate exhaustion (unlike builtins.next).
PyObject *CPyIter_Next(PyObject *iter)
{
    return (*Py_TYPE(iter)->tp_iternext)(iter);
}
// base ** index (two-argument pow; modulus None).
PyObject *CPyNumber_Power(PyObject *base, PyObject *index)
{
    return PyNumber_Power(base, index, Py_None);
}
// base **= index (in-place two-argument pow; modulus None).
PyObject *CPyNumber_InPlacePower(PyObject *base, PyObject *index)
{
    return PyNumber_InPlacePower(base, index, Py_None);
}
// obj[start:end] for tagged-int bounds: box both bounds, build a slice
// (step None), and index with it. Returns a new reference or NULL.
PyObject *CPyObject_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    PyObject *start_obj = CPyTagged_AsObject(start);
    PyObject *end_obj = CPyTagged_AsObject(end);
    if (unlikely(start_obj == NULL || end_obj == NULL)) {
        return NULL;
    }
    PyObject *slice = PySlice_New(start_obj, end_obj, NULL);
    Py_DECREF(start_obj);
    Py_DECREF(end_obj);
    if (unlikely(slice == NULL)) {
        return NULL;
    }
    PyObject *result = PyObject_GetItem(obj, slice);
    Py_DECREF(slice);
    return result;
}
typedef PyObject *(*SetupFunction)(PyObject *);
// Find and invoke the compiled class's internal setup function
// ("__internal_mypyc_setup"), searching the method tables of the type
// and its bases. Returns the setup function's result, or NULL with
// RuntimeError set if none is found.
PyObject *CPy_SetupObject(PyObject *type) {
    PyTypeObject *tp = (PyTypeObject *)type;
    for (; tp; tp = tp->tp_base) {
        // Scan the whole method table, not just the first entry, so the
        // setup function is found regardless of its position.
        for (PyMethodDef *def = tp->tp_methods; def != NULL && def->ml_name != NULL; def++) {
            if (strcmp(def->ml_name, "__internal_mypyc_setup") == 0) {
                // Double cast through void(*)(void) to silence
                // function-pointer type mismatch warnings.
                return ((SetupFunction)(void(*)(void))def->ml_meth)(type);
            }
        }
    }
    PyErr_SetString(PyExc_RuntimeError, "Internal mypyc error: Unable to find object setup function");
    return NULL;
}

View file

@ -0,0 +1,451 @@
/* getargs implementation copied from Python 3.8 and stripped down to only include
* the functions we need.
* We also add support for required kwonly args and accepting *args / **kwargs.
* A good idea would be to also vendor in the Fast versions and get our stuff
* working with *that*.
* Another probably good idea is to strip out all the formatting stuff we don't need
* and then add in custom stuff that we do need.
*
* DOCUMENTATION OF THE EXTENSIONS:
* - Arguments given after a @ format specify are required keyword-only arguments.
* The | and $ specifiers must both appear before @.
* - If the first character of a format string is %, then the function can support
* *args and **kwargs. On seeing a %, the parser will consume two arguments,
* which should be pointers to variables to store the *args and **kwargs, respectively.
* Either pointer can be NULL, in which case the function doesn't take that
* variety of vararg.
* Unlike most format specifiers, the caller takes ownership of these objects
* and is responsible for decrefing them.
* - All arguments must use the 'O' format.
* - There's minimal error checking of format strings. They are generated
* programmatically and can be assumed valid.
*/
// These macro definitions are copied from pyport.h in Python 3.9 and later
// https://bugs.python.org/issue19569
#if defined(__clang__)
#define _Py_COMP_DIAG_PUSH _Pragma("clang diagnostic push")
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
#define _Py_COMP_DIAG_POP _Pragma("clang diagnostic pop")
#elif defined(__GNUC__) \
    && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 6))
#define _Py_COMP_DIAG_PUSH _Pragma("GCC diagnostic push")
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
    _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#define _Py_COMP_DIAG_POP _Pragma("GCC diagnostic pop")
#elif defined(_MSC_VER)
#define _Py_COMP_DIAG_PUSH __pragma(warning(push))
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS __pragma(warning(disable: 4996))
#define _Py_COMP_DIAG_POP __pragma(warning(pop))
#else
// Unknown compiler: the diagnostic helpers expand to nothing.
#define _Py_COMP_DIAG_PUSH
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS
#define _Py_COMP_DIAG_POP
#endif
#include "Python.h"
#include "pythonsupport.h"
#include <ctype.h>
#include <float.h>
// Compatibility shim for headers lacking the PyDict_GET_SIZE macro;
// PyDict_Size is equivalent but goes through a function call.
#ifndef PyDict_GET_SIZE
#define PyDict_GET_SIZE(d) PyDict_Size(d)
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Public variadic entry point; defined below.
int CPyArg_ParseTupleAndKeywords(PyObject *, PyObject *,
                                 const char *, const char *, const char * const *, ...);
/* Forward */
static int vgetargskeywords(PyObject *, PyObject *,
                            const char *, const char *, const char * const *, va_list *);
// Advance past one format specifier, discarding its va_arg output slot.
static void skipitem(const char **, va_list *);
/* Support for keyword arguments donated by
Geoff Philbrick <philbric@delphi.hks.com> */
/* Return false (0) for error, else true. */
// Parse positional args (a tuple) and keyword args (a dict or NULL)
// according to `format` and `kwlist` (see the file header for the format
// extensions). Returns true (nonzero) on success; false (0) with a
// Python exception set on error.
int
CPyArg_ParseTupleAndKeywords(PyObject *args,
                             PyObject *keywords,
                             const char *format,
                             const char *fname,
                             const char * const *kwlist, ...)
{
    int retval;
    va_list va;
    va_start(va, kwlist);
    retval = vgetargskeywords(args, keywords, format, fname, kwlist, &va);
    va_end(va);
    return retval;
}
// True when the format character terminates the specifier list.
#define IS_END_OF_FORMAT(c) (c == '\0' || c == ';' || c == ':')
// Core parser behind CPyArg_ParseTupleAndKeywords. All value specifiers
// are 'O'; results stored through the va_list are borrowed references,
// except the '%' out-params (*args/**kwargs), which the caller owns.
// Returns 1 on success, 0 with an exception set on error.
static int
vgetargskeywords(PyObject *args, PyObject *kwargs, const char *format,
                 const char *fname, const char * const *kwlist, va_list *p_va)
{
    int min = INT_MAX;
    int max = INT_MAX;
    int required_kwonly_start = INT_MAX;
    int has_required_kws = 0;
    int i, pos, len;
    int skip = 0;
    Py_ssize_t nargs, nkwargs;
    PyObject *current_arg;
    int bound_pos_args;
    PyObject **p_args = NULL, **p_kwargs = NULL;
    assert(args != NULL && PyTuple_Check(args));
    assert(kwargs == NULL || PyDict_Check(kwargs));
    assert(format != NULL);
    assert(kwlist != NULL);
    assert(p_va != NULL);
    /* scan kwlist and count the number of positional-only parameters */
    for (pos = 0; kwlist[pos] && !*kwlist[pos]; pos++) {
    }
    /* scan kwlist and get greatest possible nbr of args */
    for (len = pos; kwlist[len]; len++) {
#ifdef DEBUG
        if (!*kwlist[len]) {
            PyErr_SetString(PyExc_SystemError,
                            "Empty keyword parameter name");
            return 0;
        }
#endif
    }
    // '%' prefix: consume the two out-param pointers for *args/**kwargs.
    if (*format == '%') {
        p_args = va_arg(*p_va, PyObject **);
        p_kwargs = va_arg(*p_va, PyObject **);
        format++;
    }
    nargs = PyTuple_GET_SIZE(args);
    nkwargs = (kwargs == NULL) ? 0 : PyDict_GET_SIZE(kwargs);
    if (unlikely(nargs + nkwargs > len && !p_args && !p_kwargs)) {
        /* Adding "keyword" (when nargs == 0) prevents producing wrong error
           messages in some special cases (see bpo-31229). */
        PyErr_Format(PyExc_TypeError,
                     "%.200s%s takes at most %d %sargument%s (%zd given)",
                     (fname == NULL) ? "function" : fname,
                     (fname == NULL) ? "" : "()",
                     len,
                     (nargs == 0) ? "keyword " : "",
                     (len == 1) ? "" : "s",
                     nargs + nkwargs);
        return 0;
    }
    /* convert tuple args and keyword args in same loop, using kwlist to drive process */
    for (i = 0; i < len; i++) {
        // '|' marks the start of optional arguments.
        if (*format == '|') {
#ifdef DEBUG
            if (min != INT_MAX) {
                PyErr_SetString(PyExc_SystemError,
                                "Invalid format string (| specified twice)");
                return 0;
            }
#endif
            min = i;
            format++;
#ifdef DEBUG
            if (max != INT_MAX) {
                PyErr_SetString(PyExc_SystemError,
                                "Invalid format string ($ before |)");
                return 0;
            }
#endif
            /* If there are optional args, figure out whether we have
             * required keyword arguments so that we don't bail without
             * enforcing them. */
            has_required_kws = strchr(format, '@') != NULL;
        }
        // '$' marks the start of keyword-only arguments.
        if (*format == '$') {
#ifdef DEBUG
            if (max != INT_MAX) {
                PyErr_SetString(PyExc_SystemError,
                                "Invalid format string ($ specified twice)");
                return 0;
            }
#endif
            max = i;
            format++;
#ifdef DEBUG
            if (max < pos) {
                PyErr_SetString(PyExc_SystemError,
                                "Empty parameter name after $");
                return 0;
            }
#endif
            if (skip) {
                /* Now we know the minimal and the maximal numbers of
                 * positional arguments and can raise an exception with
                 * informative message (see below). */
                break;
            }
            if (unlikely(max < nargs && !p_args)) {
                if (max == 0) {
                    PyErr_Format(PyExc_TypeError,
                                 "%.200s%s takes no positional arguments",
                                 (fname == NULL) ? "function" : fname,
                                 (fname == NULL) ? "" : "()");
                }
                else {
                    PyErr_Format(PyExc_TypeError,
                                 "%.200s%s takes %s %d positional argument%s"
                                 " (%zd given)",
                                 (fname == NULL) ? "function" : fname,
                                 (fname == NULL) ? "" : "()",
                                 (min < max) ? "at most" : "exactly",
                                 max,
                                 max == 1 ? "" : "s",
                                 nargs);
                }
                return 0;
            }
        }
        // '@' marks the start of *required* keyword-only arguments
        // (mypyc extension).
        if (*format == '@') {
#ifdef DEBUG
            if (min == INT_MAX && max == INT_MAX) {
                PyErr_SetString(PyExc_SystemError,
                                "Invalid format string "
                                "(@ without preceding | and $)");
                return 0;
            }
            if (required_kwonly_start != INT_MAX) {
                PyErr_SetString(PyExc_SystemError,
                                "Invalid format string (@ specified twice)");
                return 0;
            }
#endif
            required_kwonly_start = i;
            format++;
        }
#ifdef DEBUG
        if (IS_END_OF_FORMAT(*format)) {
            PyErr_Format(PyExc_SystemError,
                         "More keyword list entries (%d) than "
                         "format specifiers (%d)", len, i);
            return 0;
        }
#endif
        if (!skip) {
            // Look up this parameter positionally first, then by keyword.
            if (i < nargs && i < max) {
                current_arg = Py_NewRef(PyTuple_GET_ITEM(args, i));
            }
            else if (nkwargs && i >= pos) {
                if (unlikely(PyDict_GetItemStringRef(kwargs, kwlist[i], &current_arg) < 0)) {
                    return 0;
                }
                if (current_arg) {
                    --nkwargs;
                }
            }
            else {
                current_arg = NULL;
            }
            if (current_arg) {
                // Store a borrowed reference (the tuple/dict keeps it alive).
                PyObject **p = va_arg(*p_va, PyObject **);
                *p = current_arg;
                Py_DECREF(current_arg);
                format++;
                continue;
            }
            // Parameter missing: decide whether that's an error.
            if (i < min || i >= required_kwonly_start) {
                if (likely(i < pos)) {
                    assert (min == INT_MAX);
                    assert (max == INT_MAX);
                    skip = 1;
                    /* At that moment we still don't know the minimal and
                     * the maximal numbers of positional arguments. Raising
                     * an exception is deferred until we encounter | and $
                     * or the end of the format. */
                }
                else {
                    if (i >= max) {
                        PyErr_Format(PyExc_TypeError,
                                     "%.200s%s missing required "
                                     "keyword-only argument '%s'",
                                     (fname == NULL) ? "function" : fname,
                                     (fname == NULL) ? "" : "()",
                                     kwlist[i]);
                    }
                    else {
                        PyErr_Format(PyExc_TypeError,
                                     "%.200s%s missing required "
                                     "argument '%s' (pos %d)",
                                     (fname == NULL) ? "function" : fname,
                                     (fname == NULL) ? "" : "()",
                                     kwlist[i], i+1);
                    }
                    return 0;
                }
            }
            /* current code reports success when all required args
             * fulfilled and no keyword args left, with no further
             * validation. XXX Maybe skip this in debug build ?
             */
            if (!nkwargs && !skip && !has_required_kws &&
                !p_args && !p_kwargs)
            {
                return 1;
            }
        }
        /* We are into optional args, skip through to any remaining
         * keyword args */
        skipitem(&format, p_va);
    }
    if (unlikely(skip)) {
        PyErr_Format(PyExc_TypeError,
                     "%.200s%s takes %s %d positional argument%s"
                     " (%zd given)",
                     (fname == NULL) ? "function" : fname,
                     (fname == NULL) ? "" : "()",
                     (Py_MIN(pos, min) < i) ? "at least" : "exactly",
                     Py_MIN(pos, min),
                     Py_MIN(pos, min) == 1 ? "" : "s",
                     nargs);
        return 0;
    }
#ifdef DEBUG
    if (!IS_END_OF_FORMAT(*format) &&
        (*format != '|') && (*format != '$') && (*format != '@'))
    {
        PyErr_Format(PyExc_SystemError,
                     "more argument specifiers than keyword list entries "
                     "(remaining format:'%s')", format);
        return 0;
    }
#endif
    // Bundle any extra positional arguments into *args (caller owns it).
    bound_pos_args = Py_MIN(nargs, Py_MIN(max, len));
    if (p_args) {
        *p_args = PyTuple_GetSlice(args, bound_pos_args, nargs);
        if (!*p_args) {
            return 0;
        }
    }
    if (p_kwargs) {
        /* This unfortunately needs to be special cased because if len is 0 then we
         * never go through the main loop. */
        if (unlikely(nargs > 0 && len == 0 && !p_args)) {
            PyErr_Format(PyExc_TypeError,
                         "%.200s%s takes no positional arguments",
                         (fname == NULL) ? "function" : fname,
                         (fname == NULL) ? "" : "()");
            return 0;
        }
        *p_kwargs = PyDict_New();
        if (!*p_kwargs) {
            goto latefail;
        }
    }
    if (nkwargs > 0) {
        PyObject *key, *value;
        Py_ssize_t j;
        /* make sure there are no arguments given by name and position */
        for (i = pos; i < bound_pos_args && i < len; i++) {
            PyObject *current_arg;
            if (unlikely(PyDict_GetItemStringRef(kwargs, kwlist[i], &current_arg) < 0)) {
                goto latefail;
            }
            if (unlikely(current_arg != NULL)) {
                Py_DECREF(current_arg);
                /* arg present in tuple and in dict */
                PyErr_Format(PyExc_TypeError,
                             "argument for %.200s%s given by name ('%s') "
                             "and position (%d)",
                             (fname == NULL) ? "function" : fname,
                             (fname == NULL) ? "" : "()",
                             kwlist[i], i+1);
                goto latefail;
            }
        }
        /* make sure there are no extraneous keyword arguments */
        j = 0;
        while (PyDict_Next(kwargs, &j, &key, &value)) {
            int match = 0;
            if (unlikely(!PyUnicode_Check(key))) {
                PyErr_SetString(PyExc_TypeError,
                                "keywords must be strings");
                goto latefail;
            }
            for (i = pos; i < len; i++) {
                if (PyUnicode_EqualToUTF8(key, kwlist[i])) {
                    match = 1;
                    break;
                }
            }
            if (!match) {
                // Unknown keyword: either route it into **kwargs, or error.
                if (unlikely(!p_kwargs)) {
                    PyErr_Format(PyExc_TypeError,
                                 "'%U' is an invalid keyword "
                                 "argument for %.200s%s",
                                 key,
                                 (fname == NULL) ? "this function" : fname,
                                 (fname == NULL) ? "" : "()");
                    goto latefail;
                } else {
                    if (PyDict_SetItem(*p_kwargs, key, value) < 0) {
                        goto latefail;
                    }
                }
            }
        }
    }
    return 1;
    /* Handle failures that have happened after we have tried to
     * create *args and **kwargs, if they exist. */
latefail:
    if (p_args) {
        Py_XDECREF(*p_args);
    }
    if (p_kwargs) {
        Py_XDECREF(*p_kwargs);
    }
    return 0;
}
// Advance past one format specifier (always 'O' here) and discard the
// corresponding output slot from the va_list, keeping the two in sync.
static void
skipitem(const char **p_format, va_list *p_va)
{
    ++*p_format;
    if (p_va != NULL) {
        (void) va_arg(*p_va, PyObject **);
    }
}
#ifdef __cplusplus
};
#endif

View file

@ -0,0 +1,569 @@
/* getargskeywordsfast implementation copied from Python 3.9 and stripped down to
* only include the functionality we need.
*
* We also add support for required kwonly args and accepting *args / **kwargs.
*
* DOCUMENTATION OF THE EXTENSIONS:
* - Arguments given after a @ format specify required keyword-only arguments.
* The | and $ specifiers must both appear before @.
* - If the first character of a format string is %, then the function can support
* *args and/or **kwargs. In this case the parser will consume two arguments,
* which should be pointers to variables to store the *args and **kwargs, respectively.
* Either pointer can be NULL, in which case the function doesn't take that
* variety of vararg.
* Unlike most format specifiers, the caller takes ownership of these objects
* and is responsible for decrefing them.
*/
#include <Python.h>
#include "CPy.h"
#define PARSER_INITED(parser) ((parser)->kwtuple != NULL)
/* Forward */
static int
vgetargskeywordsfast_impl(PyObject *const *args, Py_ssize_t nargs,
PyObject *kwargs, PyObject *kwnames,
CPyArg_Parser *parser,
va_list *p_va);
static void skipitem_fast(const char **, va_list *);
/* Parse args for an arbitrary signature: thin varargs shim over the
 * generic implementation. Returns 1 on success, 0 with an exception set. */
int
CPyArg_ParseStackAndKeywords(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                             CPyArg_Parser *parser, ...)
{
    va_list va;
    int result;
    va_start(va, parser);
    result = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &va);
    va_end(va);
    return result;
}
/* Parse args for a function that takes no arguments at all. The common
 * case (nothing was passed) succeeds without touching the generic parser;
 * otherwise the generic parser produces the appropriate error. */
int
CPyArg_ParseStackAndKeywordsNoArgs(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                                   CPyArg_Parser *parser, ...)
{
    va_list va;
    int result;
    va_start(va, parser);
    if (nargs != 0 || kwnames != NULL) {
        // Slow path: let the generic parser report the error (or succeed).
        result = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &va);
    } else {
        // Fast path: no positional and no keyword arguments.
        result = 1;
    }
    va_end(va);
    return result;
}
/* Parse args for a function that takes exactly one argument. The fast path
 * stores the single positional argument (a borrowed reference) straight
 * into the caller's output slot. */
int
CPyArg_ParseStackAndKeywordsOneArg(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                                   CPyArg_Parser *parser, ...)
{
    va_list va;
    int result;
    va_start(va, parser);
    if (nargs == 1 && kwnames == NULL) {
        // Fast path: exactly one positional argument, no keywords.
        PyObject **out = va_arg(va, PyObject **);
        *out = args[0];
        result = 1;
    } else {
        result = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &va);
    }
    va_end(va);
    return result;
}
/* Parse args for a function with no keyword-only args, *args or **kwargs.
 * When the parser is initialized and the call supplied an acceptable number
 * of positional-only arguments, copy them (borrowed references) directly
 * into the caller's output slots; otherwise defer to the generic parser. */
int
CPyArg_ParseStackAndKeywordsSimple(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                                   CPyArg_Parser *parser, ...)
{
    va_list va;
    int result;
    va_start(va, parser);
    if (PARSER_INITED(parser) && kwnames == NULL &&
            parser->min <= nargs && nargs <= parser->max) {
        // Fast path: an acceptable count of positional arguments only.
        Py_ssize_t i;
        for (i = 0; i < nargs; i++) {
            PyObject **out = va_arg(va, PyObject **);
            *out = args[i];
        }
        result = 1;
    } else {
        result = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &va);
    }
    va_end(va);
    return result;
}
#define IS_END_OF_FORMAT(c) (c == '\0' || c == ';' || c == ':')
/* List of static parsers. */
static struct CPyArg_Parser *static_arg_parsers = NULL;
/* Lazily initialize a CPyArg_Parser from its format string and keyword list:
 * record the number of positional-only parameters (parser->pos), the min/max
 * positional counts (from the '|' and '$' markers), the start of required
 * keyword-only arguments (the '@' extension), whether *args / **kwargs are
 * accepted (leading '%'), and build an interned tuple of keyword names for
 * fast lookups. Returns 1 on success, 0 with SystemError on a malformed
 * format. Idempotent: once kwtuple is set, later calls return immediately. */
static int
parser_init(CPyArg_Parser *parser)
{
    const char * const *keywords;
    const char *format;
    int i, len, min, max, nkw;
    PyObject *kwtuple;
    assert(parser->keywords != NULL);
    if (PARSER_INITED(parser)) {
        /* Already initialized by an earlier call. */
        return 1;
    }
    keywords = parser->keywords;
    /* scan keywords and count the number of positional-only parameters
     * (marked by empty keyword strings at the front of the list) */
    for (i = 0; keywords[i] && !*keywords[i]; i++) {
    }
    parser->pos = i;
    /* scan keywords and get greatest possible nbr of args; empty names are
     * only legal in the positional-only prefix */
    for (; keywords[i]; i++) {
        if (!*keywords[i]) {
            PyErr_SetString(PyExc_SystemError,
                            "Empty keyword parameter name");
            return 0;
        }
    }
    len = i;
    parser->required_kwonly_start = INT_MAX;
    /* Leading '%' (mypyc extension) means the function accepts *args and/or
     * **kwargs; two extra output slots are consumed at parse time. */
    if (*parser->format == '%') {
        parser->format++;
        parser->varargs = 1;
    }
    format = parser->format;
    if (format) {
        /* grab the function name or custom error msg first (mutually exclusive) */
        parser->fname = strchr(parser->format, ':');
        if (parser->fname) {
            parser->fname++;
            parser->custom_msg = NULL;
        }
        else {
            parser->custom_msg = strchr(parser->format,';');
            if (parser->custom_msg)
                parser->custom_msg++;
        }
        min = max = INT_MAX;
        for (i = 0; i < len; i++) {
            /* '|' marks the first optional positional argument. */
            if (*format == '|') {
                if (min != INT_MAX) {
                    PyErr_SetString(PyExc_SystemError,
                                    "Invalid format string (| specified twice)");
                    return 0;
                }
                if (max != INT_MAX) {
                    PyErr_SetString(PyExc_SystemError,
                                    "Invalid format string ($ before |)");
                    return 0;
                }
                min = i;
                format++;
            }
            /* '$' marks the first keyword-only argument. */
            if (*format == '$') {
                if (max != INT_MAX) {
                    PyErr_SetString(PyExc_SystemError,
                                    "Invalid format string ($ specified twice)");
                    return 0;
                }
                if (i < parser->pos) {
                    PyErr_SetString(PyExc_SystemError,
                                    "Empty parameter name after $");
                    return 0;
                }
                max = i;
                format++;
            }
            /* '@' (mypyc extension) marks the first *required* keyword-only
             * argument; both '|' and '$' must appear before it. */
            if (*format == '@') {
                if (parser->required_kwonly_start != INT_MAX) {
                    PyErr_SetString(PyExc_SystemError,
                                    "Invalid format string (@ specified twice)");
                    return 0;
                }
                if (min == INT_MAX && max == INT_MAX) {
                    PyErr_SetString(PyExc_SystemError,
                                    "Invalid format string "
                                    "(@ without preceding | and $)");
                    return 0;
                }
                format++;
                parser->has_required_kws = 1;
                parser->required_kwonly_start = i;
            }
            if (IS_END_OF_FORMAT(*format)) {
                PyErr_Format(PyExc_SystemError,
                             "More keyword list entries (%d) than "
                             "format specifiers (%d)", len, i);
                return 0;
            }
            skipitem_fast(&format, NULL);
        }
        parser->min = Py_MIN(min, len);
        parser->max = Py_MIN(max, len);
        if (!IS_END_OF_FORMAT(*format) && (*format != '|') && (*format != '$')) {
            PyErr_Format(PyExc_SystemError,
                         "more argument specifiers than keyword list entries "
                         "(remaining format:'%s')", format);
            return 0;
        }
    }
    /* Intern the keyword names (past the positional-only prefix) into a
     * tuple so lookups can use fast pointer comparisons. */
    nkw = len - parser->pos;
    kwtuple = PyTuple_New(nkw);
    if (kwtuple == NULL) {
        return 0;
    }
    keywords = parser->keywords + parser->pos;
    for (i = 0; i < nkw; i++) {
        PyObject *str = PyUnicode_FromString(keywords[i]);
        if (str == NULL) {
            Py_DECREF(kwtuple);
            return 0;
        }
        PyUnicode_InternInPlace(&str);
        PyTuple_SET_ITEM(kwtuple, i, str);
    }
    parser->kwtuple = kwtuple;
    /* Link this parser into the global list of initialized parsers. */
    assert(parser->next == NULL);
    parser->next = static_arg_parsers;
    static_arg_parsers = parser;
    return 1;
}
/* Look up `key` among the keyword names of a vectorcall.
 *
 * Keyword names are normally interned, so a pointer-identity scan almost
 * always finds the match; only fall back to full unicode comparison when
 * that scan finds nothing. Returns a borrowed reference to the value, or
 * NULL if the keyword is absent. */
static PyObject*
find_keyword(PyObject *kwnames, PyObject *const *kwstack, PyObject *key)
{
    Py_ssize_t idx;
    Py_ssize_t count = PyTuple_GET_SIZE(kwnames);
    /* First pass: identity comparison (fast, usually sufficient). */
    for (idx = 0; idx < count; idx++) {
        if (PyTuple_GET_ITEM(kwnames, idx) == key) {
            return kwstack[idx];
        }
    }
    /* Second pass: compare by value for non-interned keys. */
    for (idx = 0; idx < count; idx++) {
        PyObject *name = PyTuple_GET_ITEM(kwnames, idx);
        assert(PyUnicode_Check(name));
        if (PyUnicode_Equal(name, key)) {
            return kwstack[idx];
        }
    }
    return NULL;
}
/* Core argument parser (adapted from CPython's vgetargskeywordsfast_impl).
 *
 * args/nargs carry the positional arguments; the keyword arguments arrive
 * either as a dict (kwargs) or as a vectorcall name tuple plus stack
 * (kwnames / args+nargs) -- at most one of the two is non-NULL. One
 * PyObject ** output slot per formal argument is read from *p_va. If the
 * parser accepts *args / **kwargs (leading '%' in the format), two extra
 * PyObject ** slots are read FIRST and receive NEW references owned by the
 * caller; all other output slots receive borrowed references.
 * Returns 1 on success, 0 with an exception set on failure. */
static int
vgetargskeywordsfast_impl(PyObject *const *args, Py_ssize_t nargs,
                          PyObject *kwargs, PyObject *kwnames,
                          CPyArg_Parser *parser,
                          va_list *p_va)
{
    PyObject *kwtuple;
    const char *format;
    PyObject *keyword;
    int i, pos, len;
    Py_ssize_t nkwargs;
    PyObject *current_arg;
    PyObject *const *kwstack = NULL;
    int bound_pos_args;
    PyObject **p_args = NULL, **p_kwargs = NULL;
    assert(kwargs == NULL || PyDict_Check(kwargs));
    assert(kwargs == NULL || kwnames == NULL);
    assert(p_va != NULL);
    if (!parser_init(parser)) {
        return 0;
    }
    kwtuple = parser->kwtuple;
    pos = parser->pos;
    len = pos + (int)PyTuple_GET_SIZE(kwtuple);
    if (parser->varargs) {
        /* The *args / **kwargs output slots come before all others. */
        p_args = va_arg(*p_va, PyObject **);
        p_kwargs = va_arg(*p_va, PyObject **);
    }
    if (kwargs != NULL) {
        nkwargs = PyDict_GET_SIZE(kwargs);
    }
    else if (kwnames != NULL) {
        /* Vectorcall convention: keyword values follow the positionals. */
        nkwargs = PyTuple_GET_SIZE(kwnames);
        kwstack = args + nargs;
    }
    else {
        nkwargs = 0;
    }
    if (nargs + nkwargs > len && !p_args && !p_kwargs) {
        /* Adding "keyword" (when nargs == 0) prevents producing wrong error
           messages in some special cases (see bpo-31229). */
        PyErr_Format(PyExc_TypeError,
                     "%.200s%s takes at most %d %sargument%s (%zd given)",
                     (parser->fname == NULL) ? "function" : parser->fname,
                     (parser->fname == NULL) ? "" : "()",
                     len,
                     (nargs == 0) ? "keyword " : "",
                     (len == 1) ? "" : "s",
                     nargs + nkwargs);
        return 0;
    }
    if (parser->max < nargs && !p_args) {
        /* Too many positional arguments and no *args to absorb them. */
        if (parser->max == 0) {
            PyErr_Format(PyExc_TypeError,
                         "%.200s%s takes no positional arguments",
                         (parser->fname == NULL) ? "function" : parser->fname,
                         (parser->fname == NULL) ? "" : "()");
        }
        else {
            PyErr_Format(PyExc_TypeError,
                         "%.200s%s takes %s %d positional argument%s (%zd given)",
                         (parser->fname == NULL) ? "function" : parser->fname,
                         (parser->fname == NULL) ? "" : "()",
                         (parser->min < parser->max) ? "at most" : "exactly",
                         parser->max,
                         parser->max == 1 ? "" : "s",
                         nargs);
        }
        return 0;
    }
    format = parser->format;
    /* convert tuple args and keyword args in same loop, using kwtuple to drive process */
    for (i = 0; i < len; i++) {
        /* Skip over the section markers embedded in the format string. */
        if (*format == '|') {
            format++;
        }
        if (*format == '$') {
            format++;
        }
        if (*format == '@') {
            format++;
        }
        assert(!IS_END_OF_FORMAT(*format));
        if (i < nargs && i < parser->max) {
            /* Bound positionally. */
            current_arg = args[i];
        }
        else if (nkwargs && i >= pos) {
            /* Try to bind by keyword (positional-only params can't be). */
            keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
            if (kwargs != NULL) {
                current_arg = PyDict_GetItemWithError(kwargs, keyword);
                if (!current_arg && PyErr_Occurred()) {
                    return 0;
                }
            }
            else {
                current_arg = find_keyword(kwnames, kwstack, keyword);
            }
            if (current_arg) {
                --nkwargs;
            }
        }
        else {
            current_arg = NULL;
        }
        if (current_arg) {
            /* Store the borrowed reference in the caller's output slot. */
            PyObject **p = va_arg(*p_va, PyObject **);
            *p = current_arg;
            format++;
            continue;
        }
        if (i < parser->min || i >= parser->required_kwonly_start) {
            /* Less arguments than required */
            if (i < pos) {
                Py_ssize_t min = Py_MIN(pos, parser->min);
                PyErr_Format(PyExc_TypeError,
                             "%.200s%s takes %s %d positional argument%s"
                             " (%zd given)",
                             (parser->fname == NULL) ? "function" : parser->fname,
                             (parser->fname == NULL) ? "" : "()",
                             min < parser->max ? "at least" : "exactly",
                             min,
                             min == 1 ? "" : "s",
                             nargs);
            }
            else {
                keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
                if (i >= parser->max) {
                    PyErr_Format(PyExc_TypeError, "%.200s%s missing required "
                                 "keyword-only argument '%U'",
                                 (parser->fname == NULL) ? "function" : parser->fname,
                                 (parser->fname == NULL) ? "" : "()",
                                 keyword);
                }
                else {
                    PyErr_Format(PyExc_TypeError, "%.200s%s missing required "
                                 "argument '%U' (pos %d)",
                                 (parser->fname == NULL) ? "function" : parser->fname,
                                 (parser->fname == NULL) ? "" : "()",
                                 keyword, i+1);
                }
            }
            return 0;
        }
        /* current code reports success when all required args
         * fulfilled and no keyword args left, with no further
         * validation. XXX Maybe skip this in debug build ?
         */
        if (!nkwargs && !parser->has_required_kws && !p_args && !p_kwargs) {
            return 1;
        }
        /* We are into optional args, skip through to any remaining
         * keyword args */
        skipitem_fast(&format, p_va);
    }
    assert(IS_END_OF_FORMAT(*format) || (*format == '|') || (*format == '$'));
    /* Number of positional args consumed by formal parameters. */
    bound_pos_args = Py_MIN(nargs, Py_MIN(parser->max, len));
    if (p_args) {
        /* Collect surplus positional args into a new *args tuple (new
         * reference; ownership passes to the caller). */
        *p_args = PyTuple_New(nargs - bound_pos_args);
        if (!*p_args) {
            return 0;
        }
        for (i = bound_pos_args; i < nargs; i++) {
            PyObject *arg = args[i];
            Py_INCREF(arg);
            PyTuple_SET_ITEM(*p_args, i - bound_pos_args, arg);
        }
    }
    if (p_kwargs) {
        /* This unfortunately needs to be special cased because if len is 0 then we
         * never go through the main loop. */
        if (nargs > 0 && len == 0 && !p_args) {
            PyErr_Format(PyExc_TypeError,
                         "%.200s%s takes no positional arguments",
                         (parser->fname == NULL) ? "function" : parser->fname,
                         (parser->fname == NULL) ? "" : "()");
            return 0;
        }
        *p_kwargs = PyDict_New();
        if (!*p_kwargs) {
            goto latefail;
        }
    }
    if (nkwargs > 0) {
        Py_ssize_t j;
        PyObject *value;
        /* make sure there are no arguments given by name and position */
        for (i = pos; i < bound_pos_args; i++) {
            keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
            if (kwargs != NULL) {
                current_arg = PyDict_GetItemWithError(kwargs, keyword);
                if (!current_arg && PyErr_Occurred()) {
                    goto latefail;
                }
            }
            else {
                current_arg = find_keyword(kwnames, kwstack, keyword);
            }
            if (current_arg) {
                /* arg present in tuple and in dict */
                PyErr_Format(PyExc_TypeError,
                             "argument for %.200s%s given by name ('%U') "
                             "and position (%d)",
                             (parser->fname == NULL) ? "function" : parser->fname,
                             (parser->fname == NULL) ? "" : "()",
                             keyword, i+1);
                goto latefail;
            }
        }
        /* make sure there are no extraneous keyword arguments */
        j = 0;
        while (1) {
            int match;
            /* Iterate the dict or the kwnames tuple, whichever was given. */
            if (kwargs != NULL) {
                if (!PyDict_Next(kwargs, &j, &keyword, &value))
                    break;
            }
            else {
                if (j >= PyTuple_GET_SIZE(kwnames))
                    break;
                keyword = PyTuple_GET_ITEM(kwnames, j);
                value = kwstack[j];
                j++;
            }
            match = PySequence_Contains(kwtuple, keyword);
            if (match <= 0) {
                if (!match) {
                    if (!p_kwargs) {
                        PyErr_Format(PyExc_TypeError,
                                     "'%S' is an invalid keyword "
                                     "argument for %.200s%s",
                                     keyword,
                                     (parser->fname == NULL) ? "this function" : parser->fname,
                                     (parser->fname == NULL) ? "" : "()");
                        goto latefail;
                    } else {
                        /* Unknown keyword is absorbed by **kwargs. */
                        if (PyDict_SetItem(*p_kwargs, keyword, value) < 0) {
                            goto latefail;
                        }
                    }
                } else {
                    /* match < 0: PySequence_Contains failed; propagate. */
                    goto latefail;
                }
            }
        }
    }
    return 1;
    /* Handle failures that have happened after we have tried to
     * create *args and **kwargs, if they exist. */
latefail:
    if (p_args) {
        Py_XDECREF(*p_args);
    }
    if (p_kwargs) {
        Py_XDECREF(*p_kwargs);
    }
    return 0;
}
/* Advance *p_format past one single-character format specifier and, when a
 * va_list is supplied, consume the matching PyObject ** output slot so that
 * later specifiers stay aligned with their arguments. */
static void
skipitem_fast(const char **p_format, va_list *p_va)
{
    const char *format = *p_format;
    /* Every specifier in this stripped-down parser is one character wide and
     * never needs to be inspected. (This also removes the unused local `c`
     * that triggered a -Wunused-variable warning.) */
    format++;
    if (p_va != NULL) {
        (void) va_arg(*p_va, PyObject **);
    }
    *p_format = format;
}

View file

@ -0,0 +1,25 @@
#include <Python.h>
#include "CPy.h"
#include "static_data.c"
struct ExcDummyStruct _CPy_ExcDummyStruct = { PyObject_HEAD_INIT(NULL) };
PyObject *_CPy_ExcDummy = (PyObject *)&_CPy_ExcDummyStruct;
// System-wide empty tuple constant
PyObject * __mypyc_empty_tuple__ = NULL;
// Because its dynamic linker is more restricted than linux/OS X,
// Windows doesn't allow initializing globals with values from
// other dynamic libraries. This means we need to initialize
// things at load time.
void CPy_Init(void) {
    // Patch in the type pointer that could not be set in the static
    // initializer (see the Windows dynamic-linker note above).
    _CPy_ExcDummyStruct.ob_base.ob_type = &PyBaseObject_Type;
    // Initialize system-wide empty tuple constant; the NULL guard makes
    // this safe to run more than once (e.g. from several compiled modules).
    if (__mypyc_empty_tuple__ == NULL) {
        __mypyc_empty_tuple__ = PyTuple_New(0);
        if (!__mypyc_empty_tuple__) {
            // Allocation failure this early is treated as fatal.
            CPyError_OutOfMemory();
        }
    }
}

View file

@ -0,0 +1,709 @@
// Int primitive operations (tagged arbitrary-precision integers)
//
// These are registered in mypyc.primitives.int_ops.
#include <Python.h>
#include "CPy.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifndef _WIN32
// On 64-bit Linux and macOS, ssize_t and long are both 64 bits, and
// PyLong_FromLong is faster than PyLong_FromSsize_t, so use the faster one
#define CPyLong_FromSsize_t PyLong_FromLong
#else
// On 64-bit Windows, ssize_t is 64 bits but long is 32 bits, so we
// can't use the above trick
#define CPyLong_FromSsize_t PyLong_FromSsize_t
#endif
#if defined(__GNUC__) || defined(__clang__)
# if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8)
# define CPY_CLZ(x) __builtin_clzll((unsigned long long)(x))
# define CPY_BITS 64
# else
# define CPY_CLZ(x) __builtin_clz((unsigned int)(x))
# define CPY_BITS 32
# endif
#endif
// Box a Py_ssize_t as a tagged int. Values that survive a left shift by
// one bit become short (unboxed) tagged ints; anything larger is stored
// as a Python int object with the tag bit set.
CPyTagged CPyTagged_FromSsize_t(Py_ssize_t value) {
    if (likely(!CPyTagged_TooBig(value))) {
        // Short case: the tag bit (0) is implied by the shift.
        return value << 1;
    }
    PyObject *boxed = PyLong_FromSsize_t(value);
    return ((CPyTagged)boxed) | CPY_INT_TAG;
}
CPyTagged CPyTagged_FromVoidPtr(void *ptr) {
if ((uintptr_t)ptr > PY_SSIZE_T_MAX) {
PyObject *object = PyLong_FromVoidPtr(ptr);
return ((CPyTagged)object) | CPY_INT_TAG;
} else {
return CPyTagged_FromSsize_t((Py_ssize_t)ptr);
}
}
CPyTagged CPyTagged_FromInt64(int64_t value) {
if (unlikely(CPyTagged_TooBigInt64(value))) {
PyObject *object = PyLong_FromLongLong(value);
return ((CPyTagged)object) | CPY_INT_TAG;
} else {
return value << 1;
}
}
PyObject *CPyTagged_AsObject(CPyTagged x) {
PyObject *value;
if (unlikely(CPyTagged_CheckLong(x))) {
value = CPyTagged_LongAsObject(x);
Py_INCREF(value);
} else {
value = CPyLong_FromSsize_t(CPyTagged_ShortAsSsize_t(x));
if (value == NULL) {
CPyError_OutOfMemory();
}
}
return value;
}
PyObject *CPyTagged_StealAsObject(CPyTagged x) {
PyObject *value;
if (unlikely(CPyTagged_CheckLong(x))) {
value = CPyTagged_LongAsObject(x);
} else {
value = CPyLong_FromSsize_t(CPyTagged_ShortAsSsize_t(x));
if (value == NULL) {
CPyError_OutOfMemory();
}
}
return value;
}
Py_ssize_t CPyTagged_AsSsize_t(CPyTagged x) {
if (likely(CPyTagged_CheckShort(x))) {
return CPyTagged_ShortAsSsize_t(x);
} else {
return PyLong_AsSsize_t(CPyTagged_LongAsObject(x));
}
}
CPy_NOINLINE
void CPyTagged_IncRef(CPyTagged x) {
if (unlikely(CPyTagged_CheckLong(x))) {
Py_INCREF(CPyTagged_LongAsObject(x));
}
}
CPy_NOINLINE
void CPyTagged_DecRef(CPyTagged x) {
if (unlikely(CPyTagged_CheckLong(x))) {
Py_DECREF(CPyTagged_LongAsObject(x));
}
}
CPy_NOINLINE
void CPyTagged_XDecRef(CPyTagged x) {
if (unlikely(CPyTagged_CheckLong(x))) {
Py_XDECREF(CPyTagged_LongAsObject(x));
}
}
// Tagged int negation slow path, where the result may be a long integer
CPyTagged CPyTagged_Negate_(CPyTagged num) {
PyObject *num_obj = CPyTagged_AsObject(num);
PyObject *result = PyNumber_Negative(num_obj);
if (result == NULL) {
CPyError_OutOfMemory();
}
Py_DECREF(num_obj);
return CPyTagged_StealFromObject(result);
}
// Slow-path tagged int '+': at least one operand is boxed, so the result
// may be an arbitrary-precision integer.
CPyTagged CPyTagged_Add_(CPyTagged left, CPyTagged right) {
    PyObject *lobj = CPyTagged_AsObject(left);
    PyObject *robj = CPyTagged_AsObject(right);
    PyObject *sum = PyNumber_Add(lobj, robj);
    Py_DECREF(lobj);
    Py_DECREF(robj);
    if (sum == NULL) {
        // Int addition only fails on memory exhaustion; treat as fatal.
        CPyError_OutOfMemory();
    }
    return CPyTagged_StealFromObject(sum);
}
// Tagged int subtraction slow path, where the result may be a long integer
CPyTagged CPyTagged_Subtract_(CPyTagged left, CPyTagged right) {
PyObject *left_obj = CPyTagged_AsObject(left);
PyObject *right_obj = CPyTagged_AsObject(right);
PyObject *result = PyNumber_Subtract(left_obj, right_obj);
if (result == NULL) {
CPyError_OutOfMemory();
}
Py_DECREF(left_obj);
Py_DECREF(right_obj);
return CPyTagged_StealFromObject(result);
}
// Tagged int multiplication slow path, where the result may be a long integer
CPyTagged CPyTagged_Multiply_(CPyTagged left, CPyTagged right) {
PyObject *left_obj = CPyTagged_AsObject(left);
PyObject *right_obj = CPyTagged_AsObject(right);
PyObject *result = PyNumber_Multiply(left_obj, right_obj);
if (result == NULL) {
CPyError_OutOfMemory();
}
Py_DECREF(left_obj);
Py_DECREF(right_obj);
return CPyTagged_StealFromObject(result);
}
// Tagged int // slow path, where the result may be a long integer (or raise)
CPyTagged CPyTagged_FloorDivide_(CPyTagged left, CPyTagged right) {
PyObject *left_obj = CPyTagged_AsObject(left);
PyObject *right_obj = CPyTagged_AsObject(right);
PyObject *result = PyNumber_FloorDivide(left_obj, right_obj);
Py_DECREF(left_obj);
Py_DECREF(right_obj);
// Handle exceptions honestly because it could be ZeroDivisionError
if (result == NULL) {
return CPY_INT_TAG;
} else {
return CPyTagged_StealFromObject(result);
}
}
// Tagged int % slow path, where the result may be a long integer (or raise)
CPyTagged CPyTagged_Remainder_(CPyTagged left, CPyTagged right) {
PyObject *left_obj = CPyTagged_AsObject(left);
PyObject *right_obj = CPyTagged_AsObject(right);
PyObject *result = PyNumber_Remainder(left_obj, right_obj);
Py_DECREF(left_obj);
Py_DECREF(right_obj);
// Handle exceptions honestly because it could be ZeroDivisionError
if (result == NULL) {
return CPY_INT_TAG;
} else {
return CPyTagged_StealFromObject(result);
}
}
// Equality slow path, where at least one operand is a boxed (long) int.
bool CPyTagged_IsEq_(CPyTagged left, CPyTagged right) {
    if (CPyTagged_CheckShort(right)) {
        // A short operand can never equal the boxed one -- NOTE(review):
        // this relies on callers only reaching this slow path when `left`
        // is boxed (short/short handled inline); confirm at call sites.
        return false;
    } else {
        PyObject *left_obj = CPyTagged_AsObject(left);
        PyObject *right_obj = CPyTagged_AsObject(right);
        int result = PyObject_RichCompareBool(left_obj, right_obj, Py_EQ);
        Py_DECREF(left_obj);
        Py_DECREF(right_obj);
        if (result == -1) {
            // int == int should not fail; treat any error as fatal.
            CPyError_OutOfMemory();
        }
        return result;
    }
}
bool CPyTagged_IsLt_(CPyTagged left, CPyTagged right) {
PyObject *left_obj = CPyTagged_AsObject(left);
PyObject *right_obj = CPyTagged_AsObject(right);
int result = PyObject_RichCompareBool(left_obj, right_obj, Py_LT);
Py_DECREF(left_obj);
Py_DECREF(right_obj);
if (result == -1) {
CPyError_OutOfMemory();
}
return result;
}
PyObject *CPyLong_FromStrWithBase(PyObject *o, CPyTagged base) {
Py_ssize_t base_size_t = CPyTagged_AsSsize_t(base);
return PyLong_FromUnicodeObject(o, base_size_t);
}
PyObject *CPyLong_FromStr(PyObject *o) {
CPyTagged base = CPyTagged_FromSsize_t(10);
return CPyLong_FromStrWithBase(o, base);
}
// Convert a double to a tagged int, truncating toward zero (C cast
// semantics). Returns CPY_INT_TAG with an exception set on failure.
CPyTagged CPyTagged_FromFloat(double f) {
    // In-range values become short tagged ints directly. NaN and +/-inf
    // fail both comparisons and fall through to PyLong_FromDouble, which
    // raises the appropriate error.
    if (f < ((double)CPY_TAGGED_MAX + 1.0) && f > (CPY_TAGGED_MIN - 1.0)) {
        return (Py_ssize_t)f << 1;
    }
    PyObject *o = PyLong_FromDouble(f);
    if (o == NULL)
        return CPY_INT_TAG;
    return CPyTagged_StealFromObject(o);
}
PyObject *CPyBool_Str(bool b) {
return PyObject_Str(b ? Py_True : Py_False);
}
// Bitwise op '&', '|' or '^' using the generic (slow) API
static CPyTagged GenericBitwiseOp(CPyTagged a, CPyTagged b, char op) {
PyObject *aobj = CPyTagged_AsObject(a);
PyObject *bobj = CPyTagged_AsObject(b);
PyObject *r;
if (op == '&') {
r = PyNumber_And(aobj, bobj);
} else if (op == '|') {
r = PyNumber_Or(aobj, bobj);
} else {
r = PyNumber_Xor(aobj, bobj);
}
if (unlikely(r == NULL)) {
CPyError_OutOfMemory();
}
Py_DECREF(aobj);
Py_DECREF(bobj);
return CPyTagged_StealFromObject(r);
}
// Return pointer to digits of a PyLong object. If it's a short
// integer, place digits in the buffer buf instead to avoid memory
// allocation (it's assumed to be big enough; three 2^PyLong_SHIFT digits
// cover any short tagged value). Return the number of
// digits in *size. *size is negative if the integer is negative.
static digit *GetIntDigits(CPyTagged n, Py_ssize_t *size, digit *buf) {
    if (CPyTagged_CheckShort(n)) {
        Py_ssize_t val = CPyTagged_ShortAsSsize_t(n);
        bool neg = val < 0;
        int len = 1;
        if (neg) {
            // Work on the magnitude; the sign is encoded in *size below.
            val = -val;
        }
        // Decompose into base-2^PyLong_SHIFT digits, least significant
        // first (same layout as a PyLongObject's digit array).
        buf[0] = val & PyLong_MASK;
        if (val > (Py_ssize_t)PyLong_MASK) {
            val >>= PyLong_SHIFT;
            buf[1] = val & PyLong_MASK;
            if (val > (Py_ssize_t)PyLong_MASK) {
                buf[2] = val >> PyLong_SHIFT;
                len = 3;
            } else {
                len = 2;
            }
        }
        *size = neg ? -len : len;
        return buf;
    } else {
        // Boxed int: expose the PyLongObject's own digit storage directly.
        PyLongObject *obj = (PyLongObject *)CPyTagged_LongAsObject(n);
        *size = CPY_LONG_SIZE_SIGNED(obj);
        return &CPY_LONG_DIGIT(obj, 0);
    }
}
// Shared implementation of bitwise '&', '|' and '^' (specified by op) for at least
// one long operand. This is somewhat optimized for performance.
CPyTagged CPyTagged_BitwiseLongOp_(CPyTagged a, CPyTagged b, char op) {
    // Directly access the digits, as there is no fast C API function for this.
    digit abuf[3];
    digit bbuf[3];
    Py_ssize_t asize;
    Py_ssize_t bsize;
    digit *adigits = GetIntDigits(a, &asize, abuf);
    digit *bdigits = GetIntDigits(b, &bsize, bbuf);
    if (unlikely(asize < 0 || bsize < 0)) {
        // Negative operand. This is slower, but bitwise ops on them are pretty rare.
        return GenericBitwiseOp(a, b, op);
    }
    // Optimized implementation for two non-negative integers.
    // Swap a and b as needed to ensure a is no longer than b.
    if (asize > bsize) {
        digit *tmp = adigits;
        adigits = bdigits;
        bdigits = tmp;
        Py_ssize_t tmp_size = asize;
        asize = bsize;
        bsize = tmp_size;
    }
    // '&' of two non-negative ints fits in the shorter operand's digit
    // count; '|' and '^' need the longer operand's count.
    void *digits = NULL;
    PyLongWriter *writer = PyLongWriter_Create(0, op == '&' ? asize : bsize, &digits);
    if (unlikely(writer == NULL)) {
        CPyError_OutOfMemory();
    }
    Py_ssize_t i;
    if (op == '&') {
        for (i = 0; i < asize; i++) {
            ((digit *)digits)[i] = adigits[i] & bdigits[i];
        }
    } else {
        if (op == '|') {
            for (i = 0; i < asize; i++) {
                ((digit *)digits)[i] = adigits[i] | bdigits[i];
            }
        } else {
            for (i = 0; i < asize; i++) {
                ((digit *)digits)[i] = adigits[i] ^ bdigits[i];
            }
        }
        // High digits of the longer operand pass through unchanged (the
        // shorter operand is implicitly zero there).
        for (; i < bsize; i++) {
            ((digit *)digits)[i] = bdigits[i];
        }
    }
    // PyLongWriter_Finish normalizes and returns a new int object.
    return CPyTagged_StealFromObject(PyLongWriter_Finish(writer));
}
// Bitwise '~' slow path
CPyTagged CPyTagged_Invert_(CPyTagged num) {
PyObject *obj = CPyTagged_AsObject(num);
PyObject *result = PyNumber_Invert(obj);
if (unlikely(result == NULL)) {
CPyError_OutOfMemory();
}
Py_DECREF(obj);
return CPyTagged_StealFromObject(result);
}
// Bitwise '>>' slow path
CPyTagged CPyTagged_Rshift_(CPyTagged left, CPyTagged right) {
// Long integer or negative shift -- use generic op
PyObject *lobj = CPyTagged_AsObject(left);
PyObject *robj = CPyTagged_AsObject(right);
PyObject *result = PyNumber_Rshift(lobj, robj);
Py_DECREF(lobj);
Py_DECREF(robj);
if (result == NULL) {
// Propagate error (could be negative shift count)
return CPY_INT_TAG;
}
return CPyTagged_StealFromObject(result);
}
// Bitwise '<<' slow path
CPyTagged CPyTagged_Lshift_(CPyTagged left, CPyTagged right) {
// Long integer or out of range shift -- use generic op
PyObject *lobj = CPyTagged_AsObject(left);
PyObject *robj = CPyTagged_AsObject(right);
PyObject *result = PyNumber_Lshift(lobj, robj);
Py_DECREF(lobj);
Py_DECREF(robj);
if (result == NULL) {
// Propagate error (could be negative shift count)
return CPY_INT_TAG;
}
return CPyTagged_StealFromObject(result);
}
// i64 unboxing slow path
int64_t CPyLong_AsInt64_(PyObject *o) {
int overflow;
int64_t result = PyLong_AsLongLongAndOverflow(o, &overflow);
if (result == -1) {
if (PyErr_Occurred()) {
return CPY_LL_INT_ERROR;
} else if (overflow) {
PyErr_SetString(PyExc_ValueError, "int too large to convert to i64");
return CPY_LL_INT_ERROR;
}
}
return result;
}
// Floor division for i64 with Python '//' semantics.
// Sets an exception and returns CPY_LL_INT_ERROR on division by zero or
// on the INT64_MIN // -1 overflow case.
int64_t CPyInt64_Divide(int64_t x, int64_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    if (x == INT64_MIN && y == -1) {
        PyErr_SetString(PyExc_OverflowError, "integer division overflow");
        return CPY_LL_INT_ERROR;
    }
    int64_t quotient = x / y;
    // C truncates toward zero but Python floors: when the signs differ
    // and the division was inexact, step the quotient down by one.
    int64_t leftover = x - quotient * y;
    if (leftover != 0 && ((x < 0) != (y < 0))) {
        quotient--;
    }
    return quotient;
}
// Modulo for i64 with Python '%' semantics (result takes the divisor's
// sign). Sets an exception and returns CPY_LL_INT_ERROR on division by
// zero.
int64_t CPyInt64_Remainder(int64_t x, int64_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    // Edge case: INT64_MIN % -1 would trap in hardware even though the
    // mathematical result is 0.
    if (x == INT64_MIN && y == -1) {
        return 0;
    }
    int64_t rem = x % y;
    // C's remainder takes the sign of x; Python's takes the sign of y.
    if (rem != 0 && ((rem < 0) != (y < 0))) {
        rem += y;
    }
    return rem;
}
// i32 unboxing slow path
int32_t CPyLong_AsInt32_(PyObject *o) {
int overflow;
long result = PyLong_AsLongAndOverflow(o, &overflow);
if (result > 0x7fffffffLL || result < -0x80000000LL) {
overflow = 1;
result = -1;
}
if (result == -1) {
if (PyErr_Occurred()) {
return CPY_LL_INT_ERROR;
} else if (overflow) {
PyErr_SetString(PyExc_ValueError, "int too large to convert to i32");
return CPY_LL_INT_ERROR;
}
}
return result;
}
int32_t CPyInt32_Divide(int32_t x, int32_t y) {
if (y == 0) {
PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
return CPY_LL_INT_ERROR;
}
if (y == -1 && x == INT32_MIN) {
PyErr_SetString(PyExc_OverflowError, "integer division overflow");
return CPY_LL_INT_ERROR;
}
int32_t d = x / y;
// Adjust for Python semantics
if (((x < 0) != (y < 0)) && d * y != x) {
d--;
}
return d;
}
int32_t CPyInt32_Remainder(int32_t x, int32_t y) {
if (y == 0) {
PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
return CPY_LL_INT_ERROR;
}
// Edge case: avoid core dump
if (y == -1 && x == INT32_MIN) {
return 0;
}
int32_t d = x % y;
// Adjust for Python semantics
if (((x < 0) != (y < 0)) && d != 0) {
d += y;
}
return d;
}
// Raise the standard ValueError for an out-of-range i32 conversion.
// (Uses `(void)` rather than the obsolescent empty parameter list so the
// definition is a proper prototype.)
void CPyInt32_Overflow(void) {
    PyErr_SetString(PyExc_ValueError, "int too large to convert to i32");
}
// i16 unboxing slow path
int16_t CPyLong_AsInt16_(PyObject *o) {
int overflow;
long result = PyLong_AsLongAndOverflow(o, &overflow);
if (result > 0x7fff || result < -0x8000) {
overflow = 1;
result = -1;
}
if (result == -1) {
if (PyErr_Occurred()) {
return CPY_LL_INT_ERROR;
} else if (overflow) {
PyErr_SetString(PyExc_ValueError, "int too large to convert to i16");
return CPY_LL_INT_ERROR;
}
}
return result;
}
int16_t CPyInt16_Divide(int16_t x, int16_t y) {
if (y == 0) {
PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
return CPY_LL_INT_ERROR;
}
if (y == -1 && x == INT16_MIN) {
PyErr_SetString(PyExc_OverflowError, "integer division overflow");
return CPY_LL_INT_ERROR;
}
int16_t d = x / y;
// Adjust for Python semantics
if (((x < 0) != (y < 0)) && d * y != x) {
d--;
}
return d;
}
int16_t CPyInt16_Remainder(int16_t x, int16_t y) {
if (y == 0) {
PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
return CPY_LL_INT_ERROR;
}
// Edge case: avoid core dump
if (y == -1 && x == INT16_MIN) {
return 0;
}
int16_t d = x % y;
// Adjust for Python semantics
if (((x < 0) != (y < 0)) && d != 0) {
d += y;
}
return d;
}
// Raise the standard ValueError for an out-of-range i16 conversion.
// (Uses `(void)` rather than the obsolescent empty parameter list so the
// definition is a proper prototype.)
void CPyInt16_Overflow(void) {
    PyErr_SetString(PyExc_ValueError, "int too large to convert to i16");
}
// u8 unboxing slow path
uint8_t CPyLong_AsUInt8_(PyObject *o) {
int overflow;
long result = PyLong_AsLongAndOverflow(o, &overflow);
if (result < 0 || result >= 256) {
overflow = 1;
result = -1;
}
if (result == -1) {
if (PyErr_Occurred()) {
return CPY_LL_UINT_ERROR;
} else if (overflow) {
PyErr_SetString(PyExc_ValueError, "int too large or small to convert to u8");
return CPY_LL_UINT_ERROR;
}
}
return result;
}
// Raise the standard ValueError for an out-of-range u8 conversion.
// (Uses `(void)` rather than the obsolescent empty parameter list so the
// definition is a proper prototype.)
void CPyUInt8_Overflow(void) {
    PyErr_SetString(PyExc_ValueError, "int too large or small to convert to u8");
}
// True division ('/') for tagged ints. Returns CPY_FLOAT_ERROR with an
// exception set on error (ZeroDivisionError, or a failure in the boxed
// slow path).
double CPyTagged_TrueDivide(CPyTagged x, CPyTagged y) {
    if (unlikely(y == 0)) {
        PyErr_SetString(PyExc_ZeroDivisionError, "division by zero");
        return CPY_FLOAT_ERROR;
    }
    if (likely(!CPyTagged_CheckLong(x) && !CPyTagged_CheckLong(y))) {
        // Both operands are short: divide the untagged values directly.
        return (double)((Py_ssize_t)x >> 1) / (double)((Py_ssize_t)y >> 1);
    } else {
        // Boxed slow path through the C API.
        PyObject *xo = CPyTagged_AsObject(x);
        PyObject *yo = CPyTagged_AsObject(y);
        PyObject *result = PyNumber_TrueDivide(xo, yo);
        // Bug fix: the operands (and the result, below) were previously
        // leaked on this path.
        Py_DECREF(xo);
        Py_DECREF(yo);
        if (result == NULL) {
            return CPY_FLOAT_ERROR;
        }
        double value = PyFloat_AsDouble(result);
        Py_DECREF(result);
        return value;
    }
    // (The old unreachable `return 1.0;` trailer has been removed.)
}
// Serialize the PyLong `v` into a fresh bytes object of exactly `length`
// bytes with the requested endianness and signedness. Returns NULL with an
// exception set if allocation fails or the value does not fit.
static PyObject *CPyLong_ToBytes(PyObject *v, Py_ssize_t length, int little_endian, int signed_flag) {
    // This is a wrapper for PyLong_AsByteArray and PyBytes_FromStringAndSize
    PyObject *result = PyBytes_FromStringAndSize(NULL, length);
    if (!result) {
        return NULL;
    }
    // Write directly into the bytes object's internal buffer.
    unsigned char *bytes = (unsigned char *)PyBytes_AS_STRING(result);
#if PY_VERSION_HEX >= 0x030D0000 // 3.13.0
    // CPython 3.13 added a trailing `with_exceptions` flag.
    int res = _PyLong_AsByteArray((PyLongObject *)v, bytes, length, little_endian, signed_flag, 1);
#else
    int res = _PyLong_AsByteArray((PyLongObject *)v, bytes, length, little_endian, signed_flag);
#endif
    if (res < 0) {
        Py_DECREF(result);
        return NULL;
    }
    return result;
}
// int.to_bytes(length, byteorder, signed=False)
//
// `byteorder` must be the str "little" or "big"; raises TypeError for a
// non-str and ValueError for any other string. Returns a new bytes object,
// or NULL with an exception set.
PyObject *CPyTagged_ToBytes(CPyTagged self, Py_ssize_t length, PyObject *byteorder, int signed_flag) {
    PyObject *pyint = CPyTagged_AsObject(self);
    if (!PyUnicode_Check(byteorder)) {
        Py_DECREF(pyint);
        PyErr_SetString(PyExc_TypeError, "byteorder must be str");
        return NULL;
    }
    const char *order = PyUnicode_AsUTF8(byteorder);
    if (!order) {
        Py_DECREF(pyint);
        return NULL;
    }
    int little_endian;
    if (strcmp(order, "big") == 0) {
        little_endian = 0;
    } else if (strcmp(order, "little") == 0) {
        little_endian = 1;
    } else {
        // Bug fix: this error path previously leaked `pyint` (every other
        // error path in this function releases it).
        Py_DECREF(pyint);
        PyErr_SetString(PyExc_ValueError, "byteorder must be either 'little' or 'big'");
        return NULL;
    }
    PyObject *result = CPyLong_ToBytes(pyint, length, little_endian, signed_flag);
    Py_DECREF(pyint);
    return result;
}
// int.to_bytes(length, byteorder="little", signed=False)
PyObject *CPyTagged_ToLittleEndianBytes(CPyTagged self, Py_ssize_t length, int signed_flag) {
PyObject *pyint = CPyTagged_AsObject(self);
PyObject *result = CPyLong_ToBytes(pyint, length, 1, signed_flag);
Py_DECREF(pyint);
return result;
}
// int.to_bytes(length, "big", signed=False)
PyObject *CPyTagged_ToBigEndianBytes(CPyTagged self, Py_ssize_t length, int signed_flag) {
PyObject *pyint = CPyTagged_AsObject(self);
PyObject *result = CPyLong_ToBytes(pyint, length, 0, signed_flag);
Py_DECREF(pyint);
return result;
}
// int.bit_length(): number of bits needed to represent the magnitude,
// returned as a tagged short int (0 for value 0).
CPyTagged CPyTagged_BitLength(CPyTagged self) {
    // Handle zero
    if (self == 0) {
        return 0;
    }
    // Fast path for small (tagged) ints
    if (CPyTagged_CheckShort(self)) {
        Py_ssize_t val = CPyTagged_ShortAsSsize_t(self);
        // Short tagged values use one bit fewer than Py_ssize_t, so this
        // negation cannot overflow.
        Py_ssize_t absval = val < 0 ? -val : val;
        int bits = 0;
        if (absval) {
#if defined(_MSC_VER)
#if defined(_WIN64)
            unsigned long idx;
            // _BitScanReverse64 yields the index of the highest set bit.
            if (_BitScanReverse64(&idx, (unsigned __int64)absval)) {
                bits = (int)(idx + 1);
            }
#else
            unsigned long idx;
            if (_BitScanReverse(&idx, (unsigned long)absval)) {
                bits = (int)(idx + 1);
            }
#endif
#elif defined(__GNUC__) || defined(__clang__)
            // Bit length = word width minus the count of leading zeros.
            bits = (int)(CPY_BITS - CPY_CLZ(absval));
#else
            // Fallback to loop if no builtin
            while (absval) {
                absval >>= 1;
                bits++;
            }
#endif
        }
        // Tag the small result as a short int.
        return bits << 1;
    }
    // Slow path for big ints
    PyObject *pyint = CPyTagged_AsObject(self);
    // NOTE(review): _PyLong_NumBits returns an unsigned/wider type on some
    // CPython versions (size_t, uint64_t on 3.13+); storing into `int` may
    // truncate for astronomically large ints and the `bits < 0` check may
    // then never fire -- confirm against the targeted CPython versions.
    int bits = _PyLong_NumBits(pyint);
    Py_DECREF(pyint);
    if (bits < 0) {
        // _PyLong_NumBits sets an error on failure
        return CPY_INT_TAG;
    }
    return bits << 1;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,113 @@
#ifndef LIBRT_INTERNAL_H
#define LIBRT_INTERNAL_H

#include <Python.h>

// ABI version -- only an exact match is compatible. This will only be changed in
// very exceptional cases (likely never) due to strict backward compatibility
// requirements.
#define LIBRT_INTERNAL_ABI_VERSION 2

// API version -- more recent versions must maintain backward compatibility, i.e.
// we can add new features but not remove or change existing features (unless
// ABI version is changed, but see the comment above).
#define LIBRT_INTERNAL_API_VERSION 0

// Number of functions in the capsule API. If you add a new function, also increase
// LIBRT_INTERNAL_API_VERSION.
#define LIBRT_INTERNAL_API_LEN 20

#ifdef LIBRT_INTERNAL_MODULE

// Prototypes for the implementing module. (A duplicate declaration of
// ReadBuffer_internal was removed here.)
static PyObject *ReadBuffer_internal(PyObject *source);
static PyObject *ReadBuffer_internal_empty(void);
static PyObject *WriteBuffer_internal(void);
static PyObject *WriteBuffer_getvalue_internal(PyObject *self);
static char write_bool_internal(PyObject *data, char value);
static char read_bool_internal(PyObject *data);
static char write_str_internal(PyObject *data, PyObject *value);
static PyObject *read_str_internal(PyObject *data);
static char write_float_internal(PyObject *data, double value);
static double read_float_internal(PyObject *data);
static char write_int_internal(PyObject *data, CPyTagged value);
static CPyTagged read_int_internal(PyObject *data);
static char write_tag_internal(PyObject *data, uint8_t value);
static uint8_t read_tag_internal(PyObject *data);
static int NativeInternal_ABI_Version(void);
static char write_bytes_internal(PyObject *data, PyObject *value);
static PyObject *read_bytes_internal(PyObject *data);
static uint8_t cache_version_internal(void);
static PyTypeObject *ReadBuffer_type_internal(void);
static PyTypeObject *WriteBuffer_type_internal(void);
static int NativeInternal_API_Version(void);

#else

// Consumers call through a function-pointer table imported from the
// "librt.internal._C_API" capsule. Indices must match the exporting module.
static void *NativeInternal_API[LIBRT_INTERNAL_API_LEN];

#define ReadBuffer_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[0])
#define WriteBuffer_internal (*(PyObject* (*)(void)) NativeInternal_API[1])
#define WriteBuffer_getvalue_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[2])
#define write_bool_internal (*(char (*)(PyObject *source, char value)) NativeInternal_API[3])
#define read_bool_internal (*(char (*)(PyObject *source)) NativeInternal_API[4])
#define write_str_internal (*(char (*)(PyObject *source, PyObject *value)) NativeInternal_API[5])
#define read_str_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[6])
#define write_float_internal (*(char (*)(PyObject *source, double value)) NativeInternal_API[7])
#define read_float_internal (*(double (*)(PyObject *source)) NativeInternal_API[8])
#define write_int_internal (*(char (*)(PyObject *source, CPyTagged value)) NativeInternal_API[9])
#define read_int_internal (*(CPyTagged (*)(PyObject *source)) NativeInternal_API[10])
#define write_tag_internal (*(char (*)(PyObject *source, uint8_t value)) NativeInternal_API[11])
#define read_tag_internal (*(uint8_t (*)(PyObject *source)) NativeInternal_API[12])
#define NativeInternal_ABI_Version (*(int (*)(void)) NativeInternal_API[13])
#define write_bytes_internal (*(char (*)(PyObject *source, PyObject *value)) NativeInternal_API[14])
#define read_bytes_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[15])
#define cache_version_internal (*(uint8_t (*)(void)) NativeInternal_API[16])
#define ReadBuffer_type_internal (*(PyTypeObject* (*)(void)) NativeInternal_API[17])
#define WriteBuffer_type_internal (*(PyTypeObject* (*)(void)) NativeInternal_API[18])
#define NativeInternal_API_Version (*(int (*)(void)) NativeInternal_API[19])

// Import the capsule API and verify ABI/API compatibility.
// Returns 0 on success, -1 with an exception set on failure.
static int
import_librt_internal(void)
{
    PyObject *mod = PyImport_ImportModule("librt.internal");
    if (mod == NULL)
        return -1;
    Py_DECREF(mod); // we import just for the side effect of making the below work.
    void *capsule = PyCapsule_Import("librt.internal._C_API", 0);
    if (capsule == NULL)
        return -1;
    memcpy(NativeInternal_API, capsule, sizeof(NativeInternal_API));
    // ABI must match exactly; a mismatch means incompatible struct layouts.
    if (NativeInternal_ABI_Version() != LIBRT_INTERNAL_ABI_VERSION) {
        // Use PyErr_Format instead of a manual snprintf buffer.
        PyErr_Format(PyExc_ValueError,
                     "ABI version conflict for librt.internal, expected %d, found %d",
                     LIBRT_INTERNAL_ABI_VERSION,
                     NativeInternal_ABI_Version());
        return -1;
    }
    // API only needs to be at least as new as what we were compiled against.
    if (NativeInternal_API_Version() < LIBRT_INTERNAL_API_VERSION) {
        PyErr_Format(PyExc_ValueError,
                     "API version conflict for librt.internal, expected %d or newer, found %d (hint: upgrade librt)",
                     LIBRT_INTERNAL_API_VERSION,
                     NativeInternal_API_Version());
        return -1;
    }
    return 0;
}

#endif

// Exact-type checks against the buffer types exported by librt.internal.
static inline bool CPyReadBuffer_Check(PyObject *obj) {
    return Py_TYPE(obj) == ReadBuffer_type_internal();
}

static inline bool CPyWriteBuffer_Check(PyObject *obj) {
    return Py_TYPE(obj) == WriteBuffer_type_internal();
}

#endif // LIBRT_INTERNAL_H

View file

@ -0,0 +1,395 @@
// List primitive operations
//
// These are registered in mypyc.primitives.list_ops.
#include <Python.h>
#include "CPy.h"
#ifndef Py_TPFLAGS_SEQUENCE
#define Py_TPFLAGS_SEQUENCE (1 << 5)
#endif
// Build a list of 'len' elements from varargs. Steals a reference to
// every PyObject* argument; returns NULL on allocation failure.
PyObject *CPyList_Build(Py_ssize_t len, ...) {
    PyObject *res = PyList_New(len);
    if (res == NULL) {
        return NULL;
    }
    va_list ap;
    va_start(ap, len);
    for (Py_ssize_t pos = 0; pos < len; pos++) {
        // PyList_SET_ITEM steals the caller's reference.
        PyList_SET_ITEM(res, pos, va_arg(ap, PyObject *));
    }
    va_end(ap);
    return res;
}
// list.clear(). Works on exact lists and on list subclasses (via the
// 'clear' method). Returns 1 on success, 0 with an exception set on error.
char CPyList_Clear(PyObject *list) {
    if (PyList_CheckExact(list)) {
        // Bug fix: the return value of PyList_Clear was previously ignored.
        if (PyList_Clear(list) < 0) {
            return 0;
        }
    } else {
        PyObject *res = PyObject_CallMethodNoArgs(list, mypyc_interned_str.clear);
        if (res == NULL) {
            return 0;
        }
        // Bug fix: the None returned by clear() was previously leaked.
        Py_DECREF(res);
    }
    return 1;
}
// list.copy(). Returns a new reference, or NULL with an exception set.
PyObject *CPyList_Copy(PyObject *list) {
    if (!PyList_CheckExact(list)) {
        // Subclasses may override copy(); dispatch through the method.
        return PyObject_CallMethodNoArgs(list, mypyc_interned_str.copy);
    }
    // For exact lists a full slice is a shallow copy.
    return PyList_GetSlice(list, 0, PyList_GET_SIZE(list));
}
// list[index] for an index known to be a short (unboxed) tagged int.
// Returns a new reference, or NULL with IndexError set.
PyObject *CPyList_GetItemShort(PyObject *list, CPyTagged index) {
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t len = PyList_GET_SIZE(list);
    if (pos < 0) {
        pos += len;  // resolve negative index from the end
    }
    if (pos < 0 || pos >= len) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }
    PyObject *item = PyList_GET_ITEM(list, pos);
    Py_INCREF(item);
    return item;
}
// Like CPyList_GetItemShort, but returns a borrowed reference.
PyObject *CPyList_GetItemShortBorrow(PyObject *list, CPyTagged index) {
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t len = PyList_GET_SIZE(list);
    if (pos < 0) {
        pos += len;  // resolve negative index from the end
    }
    if (pos < 0 || pos >= len) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }
    return PyList_GET_ITEM(list, pos);
}
// list[index] for an arbitrary tagged int index. Returns a new
// reference, or NULL with IndexError/OverflowError set.
PyObject *CPyList_GetItem(PyObject *list, CPyTagged index) {
    // A boxed (arbitrary-precision) index can never be a valid list index.
    if (unlikely(!CPyTagged_CheckShort(index))) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t len = PyList_GET_SIZE(list);
    if (pos < 0) {
        pos += len;  // resolve negative index from the end
    }
    if (pos < 0 || pos >= len) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }
    PyObject *item = PyList_GET_ITEM(list, pos);
    Py_INCREF(item);
    return item;
}
// Like CPyList_GetItem, but returns a borrowed reference.
PyObject *CPyList_GetItemBorrow(PyObject *list, CPyTagged index) {
    // A boxed (arbitrary-precision) index can never be a valid list index.
    if (unlikely(!CPyTagged_CheckShort(index))) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t len = PyList_GET_SIZE(list);
    if (pos < 0) {
        pos += len;  // resolve negative index from the end
    }
    if (pos < 0 || pos >= len) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }
    return PyList_GET_ITEM(list, pos);
}
// list[index] for a native int64 index. Returns a new reference, or
// NULL with IndexError set.
PyObject *CPyList_GetItemInt64(PyObject *list, int64_t index) {
    size_t len = PyList_GET_SIZE(list);
    // Single unsigned comparison covers both "negative" and "too large";
    // the fast path needs no adjustment.
    if (unlikely((uint64_t)index >= len)) {
        if (index < 0) {
            index += len;  // resolve negative index from the end
        }
        if ((uint64_t)index >= len) {
            PyErr_SetString(PyExc_IndexError, "list index out of range");
            return NULL;
        }
    }
    PyObject *item = PyList_GET_ITEM(list, index);
    Py_INCREF(item);
    return item;
}
// Like CPyList_GetItemInt64, but returns a borrowed reference.
PyObject *CPyList_GetItemInt64Borrow(PyObject *list, int64_t index) {
    size_t len = PyList_GET_SIZE(list);
    // Single unsigned comparison covers both "negative" and "too large".
    if (unlikely((uint64_t)index >= len)) {
        if (index < 0) {
            index += len;  // resolve negative index from the end
        }
        if ((uint64_t)index >= len) {
            PyErr_SetString(PyExc_IndexError, "list index out of range");
            return NULL;
        }
    }
    return PyList_GET_ITEM(list, index);
}
// list[index] = value, for a tagged int index. Steals the reference to
// 'value'. Returns false with IndexError/OverflowError set on failure.
bool CPyList_SetItem(PyObject *list, CPyTagged index, PyObject *value) {
    if (CPyTagged_CheckShort(index)) {
        Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
        Py_ssize_t size = PyList_GET_SIZE(list);
        if (n >= 0) {
            if (n >= size) {
                PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
                return false;
            }
        } else {
            n += size;
            if (n < 0) {
                PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
                return false;
            }
        }
        // Bug fix: install the new element *before* releasing the old one
        // (Py_XSETREF ordering). The old code decref'd first, so a destructor
        // triggered by the decref could observe a deallocated object still
        // reachable through the list.
        PyObject *old = PyList_GET_ITEM(list, n);
        // N.B: Steals reference
        PyList_SET_ITEM(list, n, value);
        Py_DECREF(old);
        return true;
    } else {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return false;
    }
}
// list[index] = value, for a native int64 index. Steals the reference to
// 'value'. Returns false with IndexError set on failure.
bool CPyList_SetItemInt64(PyObject *list, int64_t index, PyObject *value) {
    size_t size = PyList_GET_SIZE(list);
    if (unlikely((uint64_t)index >= size)) {
        // Bug fix: this used to test 'index > 0', which let index == 0 fall
        // through on an empty list and read/decref garbage memory below.
        if (index >= 0) {
            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
            return false;
        }
        index += size;
        if (index < 0) {
            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
            return false;
        }
    }
    // Install the new element before releasing the old one so a destructor
    // triggered by the decref never sees a freed item still in the list.
    PyObject *old = PyList_GET_ITEM(list, index);
    // N.B: Steals reference
    PyList_SET_ITEM(list, index, value);
    Py_DECREF(old);
    return true;
}
// This function should only be used to fill in brand new lists.
// It steals the reference to 'value' and performs no bounds check and no
// release of a previous element -- the target slot must be uninitialized.
void CPyList_SetItemUnsafe(PyObject *list, Py_ssize_t index, PyObject *value) {
    PyList_SET_ITEM(list, index, value);
}
#ifdef Py_GIL_DISABLED
// The original optimized list.pop implementation doesn't work on free-threaded
// builds, so provide an alternative that is a bit slower but works.
//
// Note that this implementation isn't intended to be atomic.
//
// Returns a new reference to the removed item, or NULL with an exception
// set (IndexError from either call if 'index' is out of range).
static inline PyObject *list_pop_index(PyObject *list, Py_ssize_t index) {
    // PyList_GetItemRef validates the index and returns a new reference.
    PyObject *item = PyList_GetItemRef(list, index);
    if (item == NULL) {
        return NULL;
    }
    if (PySequence_DelItem(list, index) < 0) {
        Py_DECREF(item);
        return NULL;
    }
    return item;
}
#endif
// list.pop() with no argument (remove and return the last element).
// Returns a new reference, or NULL with an exception set.
PyObject *CPyList_PopLast(PyObject *list)
{
#ifdef Py_GIL_DISABLED
    // The other implementation causes segfaults on a free-threaded Python 3.14b4 build.
    Py_ssize_t index = PyList_GET_SIZE(list) - 1;
    return list_pop_index(list, index);
#else
    // I tried a specalized version of pop_impl for just removing the
    // last element and it wasn't any faster in microbenchmarks than
    // the generic one so I ditched it.
    return list_pop_impl((PyListObject *)list, -1);
#endif
}
// list.pop(index) for a tagged int index. Returns a new reference to the
// removed element, or NULL with IndexError/OverflowError set.
PyObject *CPyList_Pop(PyObject *obj, CPyTagged index)
{
    if (CPyTagged_CheckShort(index)) {
        Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
#ifdef Py_GIL_DISABLED
        // We must use a slower implementation on free-threaded builds.
        // Normalize a negative index here; list_pop_index does not.
        if (n < 0) {
            n += PyList_GET_SIZE(obj);
        }
        return list_pop_index(obj, n);
#else
        // list_pop_impl handles negative indices and bounds itself.
        return list_pop_impl((PyListObject *)obj, n);
#endif
    } else {
        // A boxed int can never be a valid index.
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
}
// list.count(value). Returns a tagged int count, or CPY_INT_TAG with an
// exception set if an equality comparison raises.
CPyTagged CPyList_Count(PyObject *obj, PyObject *value)
{
    return list_count((PyListObject *)obj, value);
}
// list.insert(index, value) for a tagged int index.
// Returns 0 on success, -1 with an exception set on failure.
int CPyList_Insert(PyObject *list, CPyTagged index, PyObject *value)
{
    if (unlikely(!CPyTagged_CheckShort(index))) {
        // The max range doesn't exactly coincide with ssize_t, but we still
        // want to keep the error message compatible with CPython.
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }
    return PyList_Insert(list, CPyTagged_ShortAsSsize_t(index), value);
}
// list.extend(iterable). Returns None on success, NULL with an
// exception set on failure.
PyObject *CPyList_Extend(PyObject *o1, PyObject *o2) {
    int status = PyList_Extend(o1, o2);
    if (status < 0) {
        return NULL;
    }
    Py_RETURN_NONE;
}
// Return -2 on error, -1 if not found, or index of first match otherwise.
static Py_ssize_t _CPyList_Find(PyObject *list, PyObject *obj) {
    Py_ssize_t i;
    // Py_SIZE is re-read each iteration since a comparison can mutate the list.
    for (i = 0; i < Py_SIZE(list); i++) {
        PyObject *item = PyList_GET_ITEM(list, i);
        // Hold a reference across the comparison, which may run arbitrary code.
        Py_INCREF(item);
        int cmp = PyObject_RichCompareBool(item, obj, Py_EQ);
        Py_DECREF(item);
        if (cmp != 0) {
            if (cmp > 0) {
                return i;   // found a match
            } else {
                return -2;  // comparison raised; exception is set
            }
        }
    }
    return -1;
}
// list.remove(obj): delete the first matching element.
// Returns 0 on success, -1 with an exception set on failure.
int CPyList_Remove(PyObject *list, PyObject *obj) {
    Py_ssize_t pos = _CPyList_Find(list, obj);
    if (pos >= 0) {
        // Delete the single element at 'pos'.
        return PyList_SetSlice(list, pos, pos + 1, NULL);
    }
    if (pos == -1) {
        PyErr_SetString(PyExc_ValueError, "list.remove(x): x not in list");
    }
    // pos == -2: the comparison already set an exception.
    return -1;
}
// list.index(obj). Returns the position as a tagged int, or CPY_INT_TAG
// with an exception set if not found or a comparison raised.
CPyTagged CPyList_Index(PyObject *list, PyObject *obj) {
    Py_ssize_t pos = _CPyList_Find(list, obj);
    if (pos >= 0) {
        return pos << 1;  // tag as a short int
    }
    if (pos == -1) {
        PyErr_SetString(PyExc_ValueError, "value is not in list");
    }
    // pos == -2: the comparison already set an exception.
    return CPY_INT_TAG;
}
// sorted(seq): materialize the iterable into a fresh list and sort it.
// Returns a new reference, or NULL with an exception set.
PyObject *CPySequence_Sort(PyObject *seq) {
    PyObject *copy = PySequence_List(seq);
    if (copy != NULL && PyList_Sort(copy) < 0) {
        Py_DECREF(copy);
        return NULL;
    }
    return copy;
}
// seq * t_size for a tagged int count. Returns a new reference, or
// NULL with an exception set.
PyObject *CPySequence_Multiply(PyObject *seq, CPyTagged t_size) {
    Py_ssize_t count = CPyTagged_AsSsize_t(t_size);
    // -1 is the conversion error sentinel; disambiguate via PyErr_Occurred.
    if (count == -1 && PyErr_Occurred()) {
        return NULL;
    }
    return PySequence_Repeat(seq, count);
}
// t_size * seq: sequence repetition is commutative, so reuse the
// left-multiply implementation.
PyObject *CPySequence_RMultiply(CPyTagged t_size, PyObject *seq) {
    return CPySequence_Multiply(seq, t_size);
}
// seq *= t_size for a tagged int count. Returns a new reference, or
// NULL with an exception set.
PyObject *CPySequence_InPlaceMultiply(PyObject *seq, CPyTagged t_size) {
    Py_ssize_t count = CPyTagged_AsSsize_t(t_size);
    // -1 is the conversion error sentinel; disambiguate via PyErr_Occurred.
    if (count == -1 && PyErr_Occurred()) {
        return NULL;
    }
    return PySequence_InPlaceRepeat(seq, count);
}
// obj[start:end] with tagged int bounds. Fast path for exact lists with
// short indices; everything else goes through the generic slice helper.
PyObject *CPyList_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if (unlikely(!PyList_CheckExact(obj)
                 || !CPyTagged_CheckShort(start) || !CPyTagged_CheckShort(end))) {
        // Generic path handles subclasses and big-int bounds.
        return CPyObject_GetSlice(obj, start, end);
    }
    Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
    Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);
    Py_ssize_t len = PyList_GET_SIZE(obj);
    if (lo < 0) {
        lo += len;  // resolve negative bound from the end
    }
    if (hi < 0) {
        hi += len;
    }
    // PyList_GetSlice clamps out-of-range bounds itself.
    return PyList_GetSlice(obj, lo, hi);
}
// Non-zero iff the object's type advertises Py_TPFLAGS_SEQUENCE.
int CPySequence_Check(PyObject *obj) {
    unsigned long flags = Py_TYPE(obj)->tp_flags;
    return flags & Py_TPFLAGS_SEQUENCE;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,21 @@
#include <Python.h>
// Shim module entry point. This file is a template: names in single braces
// are substituted by the mypyc build, and doubled braces are literal braces
// in the generated C. The real initialization lives in the shared library
// and is reached through a capsule attribute on it.
PyMODINIT_FUNC
PyInit_{modname}(void)
{{
    PyObject *tmp;
    // Import the grouped shared library module.
    if (!(tmp = PyImport_ImportModule("{libname}"))) return NULL;
    // The capsule wraps a pointer to the real init function.
    PyObject *capsule = PyObject_GetAttrString(tmp, "init_{full_modname}");
    Py_DECREF(tmp);
    if (capsule == NULL) return NULL;
    void *init_func = PyCapsule_GetPointer(capsule, "{libname}.init_{full_modname}");
    Py_DECREF(capsule);
    if (!init_func) {{
        return NULL;
    }}
    // Delegate module creation to the shared library.
    return ((PyObject *(*)(void))init_func)();
}}

// distutils sometimes spuriously tells cl to export CPyInit___init__,
// so provide that so it chills out
PyMODINIT_FUNC PyInit___init__(void) {{ return PyInit_{modname}(); }}

View file

@ -0,0 +1,41 @@
#include <Python.h>
// Multi-phase (PEP 489) variant of the shim. This file is a template:
// names in single braces are substituted by the mypyc build, and doubled
// braces are literal braces in the generated C.

// Exec slot: fetch the real exec function from the shared library's
// capsule and run it against the freshly created module.
static int {modname}_exec(PyObject *module)
{{
    PyObject *tmp;
    if (!(tmp = PyImport_ImportModule("{libname}"))) return -1;
    PyObject *capsule = PyObject_GetAttrString(tmp, "exec_{full_modname}");
    Py_DECREF(tmp);
    if (capsule == NULL) return -1;
    void *exec_func = PyCapsule_GetPointer(capsule, "{libname}.exec_{full_modname}");
    Py_DECREF(capsule);
    if (!exec_func) return -1;
    if (((int (*)(PyObject *))exec_func)(module) != 0) return -1;
    return 0;
}}

// Slots: no subinterpreter support; declared safe without the GIL.
static PyModuleDef_Slot {modname}_slots[] = {{
    {{Py_mod_exec, {modname}_exec}},
    {{Py_mod_multiple_interpreters, Py_MOD_MULTIPLE_INTERPRETERS_NOT_SUPPORTED}},
    {{Py_mod_gil, Py_MOD_GIL_NOT_USED}},
    {{0, NULL}},
}};

static struct PyModuleDef {modname}_module = {{
    PyModuleDef_HEAD_INIT,
    .m_name = "{modname}",
    .m_doc = NULL,
    .m_methods = NULL,
    .m_size = 0,
    .m_slots = {modname}_slots,
}};

// Multi-phase init: just hand the def back to the interpreter.
PyMODINIT_FUNC
PyInit_{modname}(void)
{{
    return PyModuleDef_Init(&{modname}_module);
}}

// distutils sometimes spuriously tells cl to export CPyInit___init__,
// so provide that so it chills out
PyMODINIT_FUNC PyInit___init__(void) {{ return PyInit_{modname}(); }}

View file

@ -0,0 +1,200 @@
#ifndef MYPYC_UTIL_H
#define MYPYC_UTIL_H

#include <Python.h>
#include <frameobject.h>
#include <assert.h>

// Compiler-specific branch-prediction hints and unreachable marker;
// degrade to no-ops (or abort) on other compilers.
#if defined(__clang__) || defined(__GNUC__)
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#define CPy_Unreachable() __builtin_unreachable()
#else
#define likely(x) (x)
#define unlikely(x) (x)
#define CPy_Unreachable() abort()
#endif

// Prevent inlining (used to keep cold paths out of hot callers).
#if defined(__clang__) || defined(__GNUC__)
#define CPy_NOINLINE __attribute__((noinline))
#elif defined(_MSC_VER)
#define CPy_NOINLINE __declspec(noinline)
#else
#define CPy_NOINLINE
#endif

#ifndef Py_GIL_DISABLED
// Everything is running in the same thread, so no need for thread locals
#define CPyThreadLocal
#else
// 1. Use C11 standard thread_local storage, if available
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
#define CPyThreadLocal _Thread_local
// 2. Microsoft Visual Studio fallback
#elif defined(_MSC_VER)
#define CPyThreadLocal __declspec(thread)
// 3. GNU thread local storage for GCC/Clang targets that still need it
#elif defined(__GNUC__) || defined(__clang__)
#define CPyThreadLocal __thread
#else
#error "Can't define CPyThreadLocal for this compiler/target (consider using a non-free-threaded Python build)"
#endif
#endif // Py_GIL_DISABLED

// Helper macro for stringification in _Pragma
#define CPY_STRINGIFY(x) #x

// Loop-unrolling pragma wrapper; supported on clang and GCC >= 8 only.
#if defined(__clang__)
#define CPY_UNROLL_LOOP_IMPL(x) _Pragma(CPY_STRINGIFY(x))
#define CPY_UNROLL_LOOP(n) CPY_UNROLL_LOOP_IMPL(unroll n)
#elif defined(__GNUC__) && __GNUC__ >= 8
#define CPY_UNROLL_LOOP_IMPL(x) _Pragma(CPY_STRINGIFY(x))
#define CPY_UNROLL_LOOP(n) CPY_UNROLL_LOOP_IMPL(GCC unroll n)
#else
#define CPY_UNROLL_LOOP(n)
#endif

// INCREF and DECREF that assert the pointer is not NULL.
// asserts are disabled in release builds so there shouldn't be a perf hit.
// I'm honestly kind of surprised that this isn't done by default.
#define CPy_INCREF(p) do { assert(p); Py_INCREF(p); } while (0)
#define CPy_DECREF(p) do { assert(p); Py_DECREF(p); } while (0)
// Here just for consistency
#define CPy_XDECREF(p) Py_XDECREF(p)

#ifndef Py_GIL_DISABLED

// The *_NO_IMM operations below perform refcount manipulation for
// non-immortal objects (Python 3.12 and later).
//
// Py_INCREF and other CPython operations check for immortality. This
// can be expensive when we know that an object cannot be immortal.
//
// This optimization cannot be performed in free-threaded mode so we
// fall back to just calling the normal incref/decref operations.

static inline void CPy_INCREF_NO_IMM(PyObject *op)
{
    op->ob_refcnt++;
}

static inline void CPy_DECREF_NO_IMM(PyObject *op)
{
    if (--op->ob_refcnt == 0) {
        _Py_Dealloc(op);
    }
}

static inline void CPy_XDECREF_NO_IMM(PyObject *op)
{
    if (op != NULL && --op->ob_refcnt == 0) {
        _Py_Dealloc(op);
    }
}

// These macros add the PyObject* cast at each use site. A function-like
// macro is not expanded recursively inside its own expansion, so the
// names on the right-hand side resolve to the inline functions above.
#define CPy_INCREF_NO_IMM(op) CPy_INCREF_NO_IMM((PyObject *)(op))
#define CPy_DECREF_NO_IMM(op) CPy_DECREF_NO_IMM((PyObject *)(op))
#define CPy_XDECREF_NO_IMM(op) CPy_XDECREF_NO_IMM((PyObject *)(op))

#else

#define CPy_INCREF_NO_IMM(op) CPy_INCREF(op)
#define CPy_DECREF_NO_IMM(op) CPy_DECREF(op)
#define CPy_XDECREF_NO_IMM(op) CPy_XDECREF(op)

#endif
// Tagged integer -- our representation of Python 'int' objects.
// Small enough integers are represented as unboxed integers (shifted
// left by 1); larger integers (larger than 63 bits on a 64-bit
// platform) are stored as a tagged pointer (PyObject *)
// representing a Python int object, with the lowest bit set.
// Tagged integers are always normalized. A small integer *must not*
// have the tag bit set.
typedef size_t CPyTagged;

typedef size_t CPyPtr;

#define CPY_INT_BITS (CHAR_BIT * sizeof(CPyTagged))

// Largest/smallest values representable as an unboxed (short) tagged int;
// one bit is lost to the tag and one to the sign.
#define CPY_TAGGED_MAX (((Py_ssize_t)1 << (CPY_INT_BITS - 2)) - 1)
#define CPY_TAGGED_MIN (-((Py_ssize_t)1 << (CPY_INT_BITS - 2)))
#define CPY_TAGGED_ABS_MIN (0-(size_t)CPY_TAGGED_MIN)

typedef PyObject CPyModule;

// Tag bit used for long integers
#define CPY_INT_TAG 1

// Error value for signed fixed-width (low-level) integers
#define CPY_LL_INT_ERROR -113

// Error value for unsigned fixed-width (low-level) integers
#define CPY_LL_UINT_ERROR 239

// Error value for floats
#define CPY_FLOAT_ERROR -113.0

// Value for 'None' primitive type
#define CPY_NONE_ERROR 2
#define CPY_NONE 1

typedef void (*CPyVTableItem)(void);

static inline CPyTagged CPyTagged_ShortFromInt(int x) {
    return x << 1;
}

static inline CPyTagged CPyTagged_ShortFromSsize_t(Py_ssize_t x) {
    return x << 1;
}

// Are we targeting Python 3.X or newer? (All version gates grouped here;
// a duplicate definition of CPY_3_14_FEATURES was removed.)
#define CPY_3_11_FEATURES (PY_VERSION_HEX >= 0x030b0000)
#define CPY_3_12_FEATURES (PY_VERSION_HEX >= 0x030c0000)
#define CPY_3_13_FEATURES (PY_VERSION_HEX >= 0x030d0000)
#define CPY_3_14_FEATURES (PY_VERSION_HEX >= 0x030e0000)
#define CPY_3_15_FEATURES (PY_VERSION_HEX >= 0x030f0000)

#if CPY_3_12_FEATURES

// Same as macros in CPython internal/pycore_long.h, but with a CPY_ prefix
#define CPY_NON_SIZE_BITS 3
#define CPY_SIGN_ZERO 1
#define CPY_SIGN_NEGATIVE 2
#define CPY_SIGN_MASK 3

#define CPY_LONG_DIGIT(o, n) ((o)->long_value.ob_digit[n])

// Only available on Python 3.12 and later
#define CPY_LONG_TAG(o) ((o)->long_value.lv_tag)
#define CPY_LONG_IS_NEGATIVE(o) (((o)->long_value.lv_tag & CPY_SIGN_MASK) == CPY_SIGN_NEGATIVE)
// Only available on Python 3.12 and later
#define CPY_LONG_SIZE(o) ((o)->long_value.lv_tag >> CPY_NON_SIZE_BITS)
// Number of digits; negative for negative ints
#define CPY_LONG_SIZE_SIGNED(o) (CPY_LONG_IS_NEGATIVE(o) ? -CPY_LONG_SIZE(o) : CPY_LONG_SIZE(o))
// Number of digits, assuming int is non-negative
#define CPY_LONG_SIZE_UNSIGNED(o) CPY_LONG_SIZE(o)

#else

#define CPY_LONG_DIGIT(o, n) ((o)->ob_digit[n])
// Bug fix: the expansion previously contained an unbalanced '(' which made
// every use of this macro a syntax error on Python 3.11 and earlier.
#define CPY_LONG_IS_NEGATIVE(o) (((o)->ob_base.ob_size) < 0)
#define CPY_LONG_SIZE_SIGNED(o) ((o)->ob_base.ob_size)
#define CPY_LONG_SIZE_UNSIGNED(o) ((o)->ob_base.ob_size)

#endif

#endif // MYPYC_UTIL_H

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,209 @@
// Collects code that was copied in from cpython, for a couple of different reasons:
// * We wanted to modify it to produce a more efficient version for our uses
// * We needed to call it and it was static :(
// * We wanted to call it and needed to backport it
#include "pythonsupport.h"
/////////////////////////////////////////
// Adapted from bltinmodule.c in Python 3.7.0

// Resolve __mro_entries__ on non-type bases (PEP 560).
//
// Returns 'bases' itself when no base defines __mro_entries__ (note: no
// new reference is taken in that case -- the caller is expected to handle
// this, matching CPython's convention); otherwise returns a new tuple
// with the substitutions spliced in. Returns NULL with an exception set
// on error.
PyObject*
update_bases(PyObject *bases)
{
    Py_ssize_t i, j;
    PyObject *base, *meth, *new_base, *result, *new_bases = NULL;
    PyObject *stack[1] = {bases};
    assert(PyTuple_Check(bases));
    Py_ssize_t nargs = PyTuple_GET_SIZE(bases);
    for (i = 0; i < nargs; i++) {
        base = PyTuple_GET_ITEM(bases, i);
        if (PyType_Check(base)) {
            if (new_bases) {
                /* If we already have made a replacement, then we append every normal base,
                   otherwise just skip it. */
                if (PyList_Append(new_bases, base) < 0) {
                    goto error;
                }
            }
            continue;
        }
        // PEP 560 hook; an absent attribute is not an error.
        if (PyObject_GetOptionalAttr(base, mypyc_interned_str.__mro_entries__, &meth) < 0) {
            goto error;
        }
        if (!meth) {
            if (new_bases) {
                if (PyList_Append(new_bases, base) < 0) {
                    goto error;
                }
            }
            continue;
        }
        // Call __mro_entries__(bases); it must return a tuple of bases.
        new_base = PyObject_Vectorcall(meth, stack, 1, NULL);
        Py_DECREF(meth);
        if (!new_base) {
            goto error;
        }
        if (!PyTuple_Check(new_base)) {
            PyErr_SetString(PyExc_TypeError,
                            "__mro_entries__ must return a tuple");
            Py_DECREF(new_base);
            goto error;
        }
        if (!new_bases) {
            /* If this is a first successful replacement, create new_bases list and
               copy previously encountered bases. */
            if (!(new_bases = PyList_New(i))) {
                goto error;
            }
            for (j = 0; j < i; j++) {
                base = PyTuple_GET_ITEM(bases, j);
                PyList_SET_ITEM(new_bases, j, base);
                Py_INCREF(base);
            }
        }
        // Splice the replacement tuple's elements into the list.
        j = PyList_GET_SIZE(new_bases);
        if (PyList_SetSlice(new_bases, j, j, new_base) < 0) {
            goto error;
        }
        Py_DECREF(new_base);
    }
    if (!new_bases) {
        return bases;
    }
    result = PyList_AsTuple(new_bases);
    Py_DECREF(new_bases);
    return result;

error:
    Py_XDECREF(new_bases);
    return NULL;
}
// From Python 3.7's typeobject.c

// Call super().__init_subclass__(**kwds) for a newly created type.
// Returns 0 on success, -1 with an exception set on failure.
int
init_subclass(PyTypeObject *type, PyObject *kwds)
{
    PyObject *super, *func, *result;
    // Equivalent of super(type, type) in Python.
    PyObject *args[2] = {(PyObject *)type, (PyObject *)type};

    super = PyObject_Vectorcall((PyObject *)&PySuper_Type, args, 2, NULL);
    if (super == NULL) {
        return -1;
    }

    func = PyObject_GetAttr(super, mypyc_interned_str.__init_subclass__);
    Py_DECREF(super);
    if (func == NULL) {
        return -1;
    }

    // No positional args; forward the class keyword arguments.
    result = _PyObject_FastCallDict(func, NULL, 0, kwds);
    Py_DECREF(func);
    if (result == NULL) {
        return -1;
    }

    Py_DECREF(result);
    return 0;
}
#if CPY_3_12_FEATURES

// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined)
//
// Accumulate the digits of a multi-digit int; on success returns the
// value, otherwise returns -1 with *overflow set to the sign (+1/-1).
// Bounds are checked against the tagged-int range, not Py_ssize_t.
Py_ssize_t
CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow)
{
    PyLongObject *v = (PyLongObject *)vv;
    size_t x, prev;
    Py_ssize_t res;
    Py_ssize_t i;
    int sign;

    *overflow = 0;
    res = -1;
    // 3.12+ layout: sign and digit count are packed into lv_tag.
    i = CPY_LONG_TAG(v);
    sign = 1;
    x = 0;
    if (i & CPY_SIGN_NEGATIVE) {
        sign = -1;
    }
    i >>= CPY_NON_SIZE_BITS;
    while (--i >= 0) {
        prev = x;
        x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i);
        // If shifting back loses bits, the magnitude overflowed size_t.
        if ((x >> PyLong_SHIFT) != prev) {
            *overflow = sign;
            goto exit;
        }
    }
    /* Haven't lost any bits, but casting to long requires extra
     * care.
     */
    if (x <= (size_t)CPY_TAGGED_MAX) {
        res = (Py_ssize_t)x * sign;
    }
    else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) {
        res = CPY_TAGGED_MIN;
    }
    else {
        *overflow = sign;
        /* res is already set to -1 */
    }
  exit:
    return res;
}

#else

// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined, Python 3.11 and earlier)
Py_ssize_t
CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow)
{
    /* This version by Tim Peters */
    PyLongObject *v = (PyLongObject *)vv;
    size_t x, prev;
    Py_ssize_t res;
    Py_ssize_t i;
    int sign;

    *overflow = 0;
    res = -1;
    // Pre-3.12 layout: ob_size carries the sign and digit count.
    i = Py_SIZE(v);
    sign = 1;
    x = 0;
    if (i < 0) {
        sign = -1;
        i = -(i);
    }
    while (--i >= 0) {
        prev = x;
        x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i);
        // If shifting back loses bits, the magnitude overflowed size_t.
        if ((x >> PyLong_SHIFT) != prev) {
            *overflow = sign;
            goto exit;
        }
    }
    /* Haven't lost any bits, but casting to long requires extra
     * care.
     */
    if (x <= (size_t)CPY_TAGGED_MAX) {
        res = (Py_ssize_t)x * sign;
    }
    else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) {
        res = CPY_TAGGED_MIN;
    }
    else {
        *overflow = sign;
        /* res is already set to -1 */
    }
  exit:
    return res;
}

#endif

View file

@ -0,0 +1,363 @@
// Collects code that was copied in from cpython, for a couple of different reasons:
// * We wanted to modify it to produce a more efficient version for our uses
// * We needed to call it and it was static :(
// * We wanted to call it and needed to backport it
#ifndef CPY_PYTHONSUPPORT_H
#define CPY_PYTHONSUPPORT_H
#include <stdbool.h>
#include <Python.h>
#include "pythoncapi_compat.h"
#include <frameobject.h>
#include <assert.h>
#include "static_data.h"
#include "mypyc_util.h"
#if CPY_3_13_FEATURES
#ifndef Py_BUILD_CORE
#define Py_BUILD_CORE
#endif
#include "internal/pycore_genobject.h" // _PyGen_FetchStopIterationValue
#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause, _PyErr_SetKeyError
#include "internal/pycore_setobject.h" // _PySet_Update
#endif
#if CPY_3_12_FEATURES
#include "internal/pycore_frame.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#if 0
} // why isn't emacs smart enough to not indent this
#endif
PyObject* update_bases(PyObject *bases);
int init_subclass(PyTypeObject *type, PyObject *kwds);
Py_ssize_t
CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow);
#if CPY_3_12_FEATURES

// Inline fast path for converting a Python int to Py_ssize_t, specialized
// for the common zero/one-digit cases; multi-digit values defer to the
// out-of-line slow path. *overflow is set to the sign on overflow.
static inline Py_ssize_t
CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow)
{
    /* This version by Tim Peters */
    PyLongObject *v = (PyLongObject *)vv;
    Py_ssize_t res;
    Py_ssize_t i;

    *overflow = 0;
    res = -1;
    i = CPY_LONG_TAG(v);
    // TODO: Combine zero and non-zero cases below?
    if (likely(i == (1 << CPY_NON_SIZE_BITS))) {
        // Exactly one digit, positive sign.
        res = CPY_LONG_DIGIT(v, 0);
    } else if (likely(i == CPY_SIGN_ZERO)) {
        res = 0;
    } else if (i == ((1 << CPY_NON_SIZE_BITS) | CPY_SIGN_NEGATIVE)) {
        // Exactly one digit, negative sign.
        res = -(sdigit)CPY_LONG_DIGIT(v, 0);
    } else {
        // Slow path is moved to a non-inline helper function to
        // limit size of generated code
        int overflow_local;
        res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local);
        *overflow = overflow_local;
    }
    return res;
}

#else

// Adapted from longobject.c in Python 3.7.0

/* This function adapted from PyLong_AsLongLongAndOverflow, but with
 * some safety checks removed and specialized to only work for objects
 * that are already longs.
 * About half of the win this provides, though, just comes from being
 * able to inline the function, which in addition to saving function call
 * overhead allows the out-parameter overflow flag to be collapsed into
 * control flow.
 * Additionally, we check against the possible range of CPyTagged, not of
 * Py_ssize_t. */
static inline Py_ssize_t
CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow)
{
    /* This version by Tim Peters */
    PyLongObject *v = (PyLongObject *)vv;
    Py_ssize_t res;
    Py_ssize_t i;

    *overflow = 0;
    res = -1;
    i = Py_SIZE(v);
    if (likely(i == 1)) {
        // Exactly one digit, positive.
        res = CPY_LONG_DIGIT(v, 0);
    } else if (likely(i == 0)) {
        res = 0;
    } else if (i == -1) {
        // Exactly one digit, negative.
        res = -(sdigit)CPY_LONG_DIGIT(v, 0);
    } else {
        // Slow path is moved to a non-inline helper function to
        // limit size of generated code
        int overflow_local;
        res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local);
        *overflow = overflow_local;
    }
    return res;
}

#endif
// Adapted from listobject.c in Python 3.7.0

// Resize the list's backing array to hold 'newsize' elements, applying
// CPython's over-allocation policy. Returns 0 on success, -1 (with
// MemoryError set) on failure.
static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
    PyObject **items;
    size_t new_allocated, num_allocated_bytes;
    Py_ssize_t allocated = self->allocated;

    /* Bypass realloc() when a previous overallocation is large enough
       to accommodate the newsize.  If the newsize falls lower than half
       the allocated size, then proceed with the realloc() to shrink the list.
    */
    if (allocated >= newsize && newsize >= (allocated >> 1)) {
        assert(self->ob_item != NULL || newsize == 0);
        Py_SET_SIZE(self, newsize);
        return 0;
    }

    /* This over-allocates proportional to the list size, making room
     * for additional growth.  The over-allocation is mild, but is
     * enough to give linear-time amortized behavior over a long
     * sequence of appends() in the presence of a poorly-performing
     * system realloc().
     * The growth pattern is:  0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
     * Note: new_allocated won't overflow because the largest possible value
     *       is PY_SSIZE_T_MAX * (9 / 8) + 6 which always fits in a size_t.
     */
    new_allocated = (size_t)newsize + (newsize >> 3) + (newsize < 9 ? 3 : 6);
    if (new_allocated > (size_t)PY_SSIZE_T_MAX / sizeof(PyObject *)) {
        PyErr_NoMemory();
        return -1;
    }

    if (newsize == 0)
        new_allocated = 0;
    num_allocated_bytes = new_allocated * sizeof(PyObject *);
    items = (PyObject **)PyMem_Realloc(self->ob_item, num_allocated_bytes);
    if (items == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    self->ob_item = items;
    Py_SET_SIZE(self, newsize);
    self->allocated = new_allocated;
    return 0;
}
// Changed to use PyList_SetSlice instead of the internal list_ass_slice

// Remove and return the element at 'index' (negative indices count from
// the end). Returns a new reference, or NULL with IndexError set.
static PyObject *
list_pop_impl(PyListObject *self, Py_ssize_t index)
{
    PyObject *v;
    int status;

    if (Py_SIZE(self) == 0) {
        /* Special-case most common failure cause */
        PyErr_SetString(PyExc_IndexError, "pop from empty list");
        return NULL;
    }
    if (index < 0)
        index += Py_SIZE(self);
    if (index < 0 || index >= Py_SIZE(self)) {
        PyErr_SetString(PyExc_IndexError, "pop index out of range");
        return NULL;
    }

    v = self->ob_item[index];
    if (index == Py_SIZE(self) - 1) {
        // Popping the last element: just shrink, the list's reference
        // transfers directly to the caller.
        status = list_resize(self, Py_SIZE(self) - 1);
        if (status >= 0)
            return v; /* and v now owns the reference the list had */
        else
            return NULL;
    }
    // Popping from the middle: hold an extra reference while the slice
    // deletion drops the list's own reference.
    Py_INCREF(v);
    status = PyList_SetSlice((PyObject *)self, index, index+1, (PyObject *)NULL);
    if (status < 0) {
        Py_DECREF(v);
        return NULL;
    }
    return v;
}
// Tweaked to directly use CPyTagged
// list.count(value): number of items equal to `value`, as a tagged int.
// Returns CPY_INT_TAG (error sentinel) if a comparison raises.
static CPyTagged
list_count(PyListObject *self, PyObject *value)
{
    Py_ssize_t matches = 0;
    for (Py_ssize_t idx = 0; idx < Py_SIZE(self); idx++) {
        int eq = PyObject_RichCompareBool(self->ob_item[idx], value, Py_EQ);
        if (eq < 0) {
            // Comparison raised an exception.
            return CPY_INT_TAG;
        }
        if (eq > 0) {
            matches++;
        }
    }
    return CPyTagged_ShortFromSsize_t(matches);
}
// Adapted from genobject.c in Python 3.7.2
// Copied because it wasn't in 3.5.2 and it is undocumented anyways.
/*
* Set StopIteration with specified value. Value can be arbitrary object
* or NULL.
*
* Returns 0 if StopIteration is set and -1 if any other exception is set.
*/
static int
CPyGen_SetStopIterationValue(PyObject *value)
{
    // Simple case: a non-tuple, non-exception value (or NULL) can be handed
    // to PyErr_SetObject directly, delaying exception instantiation.
    if (value == NULL ||
            (!PyTuple_Check(value) && !PyExceptionInstance_Check(value))) {
        PyErr_SetObject(PyExc_StopIteration, value);
        return 0;
    }
    /* Construct an exception instance manually with
     * PyObject_CallOneArg and pass it to PyErr_SetObject.
     *
     * We do this to handle a situation when "value" is a tuple, in which
     * case PyErr_SetObject would set the value of StopIteration to
     * the first element of the tuple.
     *
     * (See PyErr_SetObject/_PyErr_CreateException code for details.)
     */
    PyObject *exc = PyObject_CallOneArg(PyExc_StopIteration, value);
    if (exc == NULL) {
        return -1;
    }
    PyErr_SetObject(PyExc_StopIteration, exc);
    Py_DECREF(exc);
    return 0;
}
// Copied from dictobject.c and dictobject.h, these are not Public before
// Python 3.8. Also remove some error checks that we do in the callers.
// Minimal clone of CPython's dict-view object layout; duplicated here
// because the type was not public before Python 3.8.
typedef struct {
    PyObject_HEAD
    PyDictObject *dv_dict;  // dict being viewed (owned reference)
} _CPyDictViewObject;
// Allocate a dict view of the given `type` over `dict`.
// Takes a new reference to `dict`; returns NULL on allocation failure.
static PyObject *
_CPyDictView_New(PyObject *dict, PyTypeObject *type)
{
    _CPyDictViewObject *view = PyObject_GC_New(_CPyDictViewObject, type);
    if (view == NULL) {
        return NULL;
    }
    Py_INCREF(dict);
    view->dv_dict = (PyDictObject *)dict;
    PyObject_GC_Track(view);
    return (PyObject *)view;
}
#ifdef __cplusplus
}
#endif
#if CPY_3_12_FEATURES
// These are copied from genobject.c in Python 3.12
// Return 1 if o is a generator whose code object carries the
// CO_ITERABLE_COROUTINE flag (set by @types.coroutine), else 0.
static int
gen_is_coroutine(PyObject *o)
{
    if (PyGen_CheckExact(o)) {
        PyCodeObject *code = PyGen_GetCode((PyGenObject*)o);
        if (code->co_flags & CO_ITERABLE_COROUTINE) {
            return 1;
        }
    }
    return 0;
}
#else
// Copied from genobject.c in Python 3.10
// Same check as the 3.12 variant, but reads gi_code directly since
// PyGen_GetCode is not available before 3.12.
static int
gen_is_coroutine(PyObject *o)
{
    if (PyGen_CheckExact(o)) {
        PyCodeObject *code = (PyCodeObject *)((PyGenObject*)o)->gi_code;
        if (code->co_flags & CO_ITERABLE_COROUTINE) {
            return 1;
        }
    }
    return 0;
}
#endif
/*
* This helper function returns an awaitable for `o`:
* - `o` if `o` is a coroutine-object;
* - `type(o)->tp_as_async->am_await(o)`
*
* Raises a TypeError if it's not possible to return
* an awaitable and returns NULL.
*/
static PyObject *
CPyCoro_GetAwaitableIter(PyObject *o)
{
    unaryfunc getter = NULL;
    PyTypeObject *ot;
    if (PyCoro_CheckExact(o) || gen_is_coroutine(o)) {
        /* 'o' is a coroutine. */
        Py_INCREF(o);
        return o;
    }
    ot = Py_TYPE(o);
    if (ot->tp_as_async != NULL) {
        getter = ot->tp_as_async->am_await;
    }
    if (getter != NULL) {
        PyObject *res = (*getter)(o);
        if (res != NULL) {
            if (PyCoro_CheckExact(res) || gen_is_coroutine(res)) {
                /* __await__ must return an *iterator*, not
                   a coroutine or another awaitable (see PEP 492) */
                PyErr_SetString(PyExc_TypeError,
                                "__await__() returned a coroutine");
                Py_CLEAR(res);
            } else if (!PyIter_Check(res)) {
                PyErr_Format(PyExc_TypeError,
                             "__await__() returned non-iterator "
                             "of type '%.100s'",
                             Py_TYPE(res)->tp_name);
                Py_CLEAR(res);
            }
        }
        // res may be NULL here: either am_await raised, or one of the
        // checks above cleared it with an exception set.
        return res;
    }
    PyErr_Format(PyExc_TypeError,
                 "object %.100s can't be used in 'await' expression",
                 ot->tp_name);
    return NULL;
}
#endif

View file

@ -0,0 +1,17 @@
// Set primitive operations
//
// These are registered in mypyc.primitives.set_ops.
#include <Python.h>
#include "CPy.h"
// set.remove(key): discard `key`, raising KeyError if it was absent.
// Returns true on success; false with an exception set otherwise.
bool CPySet_Remove(PyObject *set, PyObject *key) {
    switch (PySet_Discard(set, key)) {
        case 1:
            // Key was present and removed.
            return true;
        case 0:
            // Key was absent: remove() (unlike discard()) raises KeyError.
            _PyErr_SetKeyError(key);
            return false;
        default:
            // PySet_Discard failed and already set an exception.
            return false;
    }
}

View file

@ -0,0 +1,75 @@
#ifndef STATIC_DATA
#define STATIC_DATA
#include "static_data.h"
// Adopted from numpy 2.4.0: numpy/_core/src/multiarry/npy_static_data.c
// Global table of interned name strings used by generated code; populated
// once by intern_strings().
mypyc_interned_str_struct mypyc_interned_str;
// Intern `string` into mypyc_interned_str.struct_member, returning -1 from
// the enclosing function on failure. Wrapped in do { ... } while (0) so this
// multi-statement macro expands to a single statement and stays safe inside
// unbraced if/else bodies.
#define INTERN_STRING(struct_member, string) \
    do { \
        assert(mypyc_interned_str.struct_member == NULL); \
        mypyc_interned_str.struct_member = PyUnicode_InternFromString(string); \
        if (mypyc_interned_str.struct_member == NULL) { \
            return -1; \
        } \
    } while (0)
// Intern all strings in the table. Idempotent; returns 0 on success and
// -1 (with an exception set) if any interning fails.
int
intern_strings(void) {
    if (mypyc_interned_str.values != NULL) {
        // Already interned.
        return 0;
    }
    INTERN_STRING(__init_subclass__, "__init_subclass__");
    INTERN_STRING(__module__, "__module__");
    INTERN_STRING(__mro_entries__, "__mro_entries__");
    INTERN_STRING(__mypyc_attrs__, "__mypyc_attrs__");
    INTERN_STRING(__name__, "__name__");
    INTERN_STRING(__orig_bases__, "__orig_bases__");
    INTERN_STRING(__qualname__, "__qualname__");
    INTERN_STRING(__slots__, "__slots__");
    INTERN_STRING(__radd__, "__radd__");
    INTERN_STRING(__rsub__, "__rsub__");
    INTERN_STRING(__rmul__, "__rmul__");
    INTERN_STRING(__rtruediv__, "__rtruediv__");
    INTERN_STRING(__rmod__, "__rmod__");
    INTERN_STRING(__rdivmod__, "__rdivmod__");
    INTERN_STRING(__rfloordiv__, "__rfloordiv__");
    INTERN_STRING(__rpow__, "__rpow__");
    INTERN_STRING(__rmatmul__, "__rmatmul__");
    INTERN_STRING(__rand__, "__rand__");
    INTERN_STRING(__ror__, "__ror__");
    INTERN_STRING(__rxor__, "__rxor__");
    INTERN_STRING(__rlshift__, "__rlshift__");
    INTERN_STRING(__rrshift__, "__rrshift__");
    INTERN_STRING(__eq__, "__eq__");
    INTERN_STRING(__ne__, "__ne__");
    INTERN_STRING(__gt__, "__gt__");
    INTERN_STRING(__le__, "__le__");
    INTERN_STRING(__lt__, "__lt__");
    INTERN_STRING(__ge__, "__ge__");
    INTERN_STRING(clear, "clear");
    INTERN_STRING(close_, "close");
    INTERN_STRING(copy, "copy");
    INTERN_STRING(dispatch_cache, "dispatch_cache");
    INTERN_STRING(endswith, "endswith");
    INTERN_STRING(get_type_hints, "get_type_hints");
    INTERN_STRING(keys, "keys");
    INTERN_STRING(lower, "lower");
    INTERN_STRING(items, "items");
    INTERN_STRING(join, "join");
    INTERN_STRING(register_, "register");
    INTERN_STRING(registry, "registry");
    INTERN_STRING(send, "send");
    INTERN_STRING(setdefault, "setdefault");
    INTERN_STRING(startswith, "startswith");
    INTERN_STRING(super, "super");
    INTERN_STRING(throw_, "throw");
    INTERN_STRING(translate, "translate");
    INTERN_STRING(update, "update");
    INTERN_STRING(upper, "upper");
    INTERN_STRING(values, "values");
    return 0;
}
#endif

View file

@ -0,0 +1,72 @@
#ifndef STATIC_DATA_H
#define STATIC_DATA_H
#include <Python.h>
#ifdef __cplusplus
extern "C" {
#endif
// Adopted from numpy 2.4.0: numpy/_core/src/multiarry/npy_static_data.h
int intern_strings(void);
// Table of pre-interned strings for names that generated code looks up
// frequently. Entries are NULL until intern_strings() has run.
// Members with a trailing underscore avoid clashes with C/C++ keywords
// (e.g. `register`, `throw`) or common identifiers (`close`).
typedef struct mypyc_interned_str_struct {
    PyObject *__init_subclass__;
    PyObject *__module__;
    PyObject *__mro_entries__;
    PyObject *__mypyc_attrs__;
    PyObject *__orig_bases__;
    PyObject *__qualname__;
    PyObject *__slots__;
    PyObject *__name__;
    PyObject *__radd__;
    PyObject *__rsub__;
    PyObject *__rmul__;
    PyObject *__rtruediv__;
    PyObject *__rmod__;
    PyObject *__rdivmod__;
    PyObject *__rfloordiv__;
    PyObject *__rpow__;
    PyObject *__rmatmul__;
    PyObject *__rand__;
    PyObject *__ror__;
    PyObject *__rxor__;
    PyObject *__rlshift__;
    PyObject *__rrshift__;
    PyObject *__eq__;
    PyObject *__ne__;
    PyObject *__gt__;
    PyObject *__le__;
    PyObject *__lt__;
    PyObject *__ge__;
    PyObject *clear;
    PyObject *close_;
    PyObject *copy;
    PyObject *dispatch_cache;
    PyObject *endswith;
    PyObject *get_type_hints;
    PyObject *keys;
    PyObject *lower;
    PyObject *items;
    PyObject *join;
    PyObject *register_;
    PyObject *registry;
    PyObject *send;
    PyObject *setdefault;
    PyObject *startswith;
    PyObject *super;
    PyObject *throw_;
    PyObject *translate;
    PyObject *update;
    PyObject *upper;
    PyObject *values;
} mypyc_interned_str_struct;
extern mypyc_interned_str_struct mypyc_interned_str;
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,4 @@
#include "str_extra_ops.h"
// All str extra ops are inline functions in str_extra_ops.h
// This file exists to satisfy the SourceDep requirements

View file

@ -0,0 +1,29 @@
#ifndef MYPYC_STR_EXTRA_OPS_H
#define MYPYC_STR_EXTRA_OPS_H
#include <Python.h>
#include <stdint.h>
#include "CPy.h"
// Optimized str indexing for ord(s[i])
// If index is negative, convert to non-negative index (no range checking)
static inline int64_t CPyStr_AdjustIndex(PyObject *obj, int64_t index) {
    // Negative values index from the end of the string; no range checking.
    return index < 0 ? index + PyUnicode_GET_LENGTH(obj) : index;
}
// Check if index is in valid range [0, len)
static inline bool CPyStr_RangeCheck(PyObject *obj, int64_t index) {
    // Valid indices satisfy 0 <= index < len(obj).
    if (index < 0) {
        return false;
    }
    return index < PyUnicode_GET_LENGTH(obj);
}
// Get character at index as int (ord value) - no bounds checking, returns as CPyTagged
static inline CPyTagged CPyStr_GetItemUnsafeAsInt(PyObject *obj, int64_t index) {
    Py_UCS4 ch = PyUnicode_READ(PyUnicode_KIND(obj), PyUnicode_DATA(obj), index);
    // Tag the code point as a short int (<< 1 cannot overflow: the maximum
    // Unicode code point is 0x10FFFF).
    return ch << 1;
}
#endif

View file

@ -0,0 +1,793 @@
#include "pythoncapi_compat.h"
// String primitive operations
//
// These are registered in mypyc.primitives.str_ops.
#include <Python.h>
#include "CPy.h"
// The _PyUnicode_CheckConsistency definition has been moved to the internal API
// https://github.com/python/cpython/pull/106398
#if defined(Py_DEBUG) && CPY_3_13_FEATURES
#include "internal/pycore_unicodeobject.h"
#endif
// Copied from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
#define BLOOM_MASK unsigned long
#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
// Copied from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
// This is needed for str.strip("...").
// Build a bloom-filter bitmask over the characters of a string: each
// character sets the bit (ch mod BLOOM_WIDTH). BLOOM(mask, ch) then gives a
// cheap "definitely absent" test; hits must be confirmed by a real search.
static inline BLOOM_MASK
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
{
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
    do { \
        TYPE *data = (TYPE *)PTR; \
        TYPE *end = data + LEN; \
        Py_UCS4 ch; \
        for (; data != end; data++) { \
            ch = *data; \
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
        } \
        break; \
    } while (0)
    /* calculate simple bloom-style bitmask for a given unicode string */
    BLOOM_MASK mask;
    mask = 0;
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
        break;
    case PyUnicode_2BYTE_KIND:
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
        break;
    case PyUnicode_4BYTE_KIND:
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
        break;
    default:
        Py_UNREACHABLE();
    }
    return mask;
#undef BLOOM_UPDATE
}
static inline char _CPyStr_Equal_NoIdentCheck(PyObject *str1, PyObject *str2, Py_ssize_t str2_length) {
    // Shared tail of CPyStr_Equal and CPyStr_EqualLiteral; the identity
    // check has already been done by the caller.
    if (PyUnicode_GET_LENGTH(str1) != str2_length) {
        return 0;
    }
    int kind = PyUnicode_KIND(str1);
    if (kind != PyUnicode_KIND(str2)) {
        // Different storage kinds imply different contents (strings are
        // stored in their narrowest kind).
        return 0;
    }
    return memcmp(PyUnicode_DATA(str1), PyUnicode_DATA(str2),
                  str2_length * kind) == 0;
}
// Adapted from CPython 3.13.1 (_PyUnicode_Equal)
// str == str. Adapted from CPython 3.13.1 (_PyUnicode_Equal).
char CPyStr_Equal(PyObject *str1, PyObject *str2) {
    if (str1 != str2) {
        return _CPyStr_Equal_NoIdentCheck(str1, str2, PyUnicode_GET_LENGTH(str2));
    }
    // Identical objects are trivially equal.
    return 1;
}
// Compare against a literal whose length is known at compile time.
char CPyStr_EqualLiteral(PyObject *str, PyObject *literal_str, Py_ssize_t literal_length) {
    return str == literal_str
        ? 1
        : _CPyStr_Equal_NoIdentCheck(str, literal_str, literal_length);
}
// Implementation of s[index] for str with a tagged-int index. Negative
// indices count from the end. Returns a new reference, or NULL with
// IndexError/OverflowError (or a propagated error) set.
PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
    if (PyUnicode_READY(str) != -1) {
        if (CPyTagged_CheckShort(index)) {
            Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
            Py_ssize_t size = PyUnicode_GET_LENGTH(str);
            if (n < 0)
                n += size;
            if (n < 0 || n >= size) {
                PyErr_SetString(PyExc_IndexError, "string index out of range");
                return NULL;
            }
            enum PyUnicode_Kind kind = (enum PyUnicode_Kind)PyUnicode_KIND(str);
            void *data = PyUnicode_DATA(str);
            Py_UCS4 ch = PyUnicode_READ(kind, data, n);
            if (ch < 256) {
                // Latin-1 single-char strings are cached by CPython, so
                // PyUnicode_FromOrdinal returns the cached object (with a
                // new reference) instead of allocating a new string each time.
                return PyUnicode_FromOrdinal(ch);
            }
            PyObject *unicode = PyUnicode_New(1, ch);
            if (unicode == NULL)
                return NULL;
            if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
                PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
            } else {
                assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
                PyUnicode_4BYTE_DATA(unicode)[0] = ch;
            }
            return unicode;
        } else {
            // A non-short tagged int can never be a valid string index.
            PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
            return NULL;
        }
    } else {
        // Cold fallback: go through the generic item protocol. Release the
        // temporary index object -- PyObject_GetItem does not steal its
        // reference (the previous code leaked it here).
        PyObject *index_obj = CPyTagged_AsObject(index);
        PyObject *result = PyObject_GetItem(str, index_obj);
        Py_DECREF(index_obj);
        return result;
    }
}
// Like CPyStr_GetItem, but takes an untagged native index.
PyObject *CPyStr_GetItemUnsafe(PyObject *str, Py_ssize_t index) {
    // This is unsafe since we don't check for overflow when doing <<.
    return CPyStr_GetItem(str, index << 1);
}
// A simplification of _PyUnicode_JoinArray() from CPython 3.9.6
// Concatenate `len` str objects passed as varargs into a new str.
// A simplification of _PyUnicode_JoinArray() from CPython 3.9.6.
// Returns a new reference, or NULL with TypeError/OverflowError set.
PyObject *CPyStr_Build(Py_ssize_t len, ...) {
    Py_ssize_t i;
    va_list args;
    // Calculate the total amount of space and check
    // whether all components have the same kind.
    Py_ssize_t sz = 0;
    Py_UCS4 maxchar = 0;
    int use_memcpy = 1; // Use memcpy by default
    PyObject *last_obj = NULL;
    va_start(args, len);
    for (i = 0; i < len; i++) {
        PyObject *item = va_arg(args, PyObject *);
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            // Every va_start must be matched by va_end, including on error
            // paths (C11 7.16.1); the previous code skipped it here.
            va_end(args);
            return NULL;
        }
        if (PyUnicode_READY(item) == -1) {
            va_end(args);
            return NULL;
        }
        size_t add_sz = PyUnicode_GET_LENGTH(item);
        Py_UCS4 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        // Using size_t to avoid overflow during arithmetic calculation
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            va_end(args);
            return NULL;
        }
        sz += add_sz;
        // If these strings have different kind, we would call
        // _PyUnicode_FastCopyCharacters() in the following part.
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }
    va_end(args);
    // Construct the string
    PyObject *res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        return NULL;
    if (use_memcpy) {
        // All components share one storage kind: bulk-copy raw code units.
        unsigned char *res_data = PyUnicode_1BYTE_DATA(res);
        unsigned int kind = PyUnicode_KIND(res);
        va_start(args, len);
        for (i = 0; i < len; ++i) {
            PyObject *item = va_arg(args, PyObject *);
            Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data, PyUnicode_DATA(item), kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        va_end(args);
        assert(res_data == PyUnicode_1BYTE_DATA(res) + kind * PyUnicode_GET_LENGTH(res));
    } else {
        // Mixed kinds: copy per item with code-point widening.
        Py_ssize_t res_offset = 0;
        va_start(args, len);
        for (i = 0; i < len; ++i) {
            PyObject *item = va_arg(args, PyObject *);
            Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
#if CPY_3_13_FEATURES
                PyUnicode_CopyCharacters(res, res_offset, item, 0, itemlen);
#else
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
#endif
                res_offset += itemlen;
            }
        }
        va_end(args);
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }
#ifdef Py_DEBUG
    assert(_PyUnicode_CheckConsistency(res, 1));
#endif
    return res;
}
// str.find()/rfind() over the whole string (end defaults to len(str)).
CPyTagged CPyStr_Find(PyObject *str, PyObject *substr, CPyTagged start, int direction) {
    return CPyStr_FindWithEnd(str, substr, start,
                              PyUnicode_GET_LENGTH(str) << 1, direction);
}
// str.find()/rfind() with explicit tagged start/end. direction > 0 searches
// forward, < 0 backward. Returns a tagged index (-1 for "not found"), or
// CPY_INT_TAG with an exception set on error.
CPyTagged CPyStr_FindWithEnd(PyObject *str, PyObject *substr, CPyTagged start, CPyTagged end, int direction) {
    Py_ssize_t begin = CPyTagged_AsSsize_t(start);
    if (begin == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return CPY_INT_TAG;
    }
    Py_ssize_t stop = CPyTagged_AsSsize_t(end);
    if (stop == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return CPY_INT_TAG;
    }
    Py_ssize_t where = PyUnicode_Find(str, substr, begin, stop, direction);
    if (unlikely(where == -2)) {
        // PyUnicode_Find signals an error with -2 (exception already set).
        return CPY_INT_TAG;
    }
    return where << 1;
}
// str.split(sep, maxsplit) with a tagged maxsplit argument.
PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split) {
    Py_ssize_t limit = CPyTagged_AsSsize_t(max_split);
    if (limit == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PyUnicode_Split(str, sep, limit);
}
// str.rsplit(sep, maxsplit) with a tagged maxsplit argument.
PyObject *CPyStr_RSplit(PyObject *str, PyObject *sep, CPyTagged max_split) {
    Py_ssize_t limit = CPyTagged_AsSsize_t(max_split);
    if (limit == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PyUnicode_RSplit(str, sep, limit);
}
// This function has been copied from _PyUnicode_XStrip in cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
// Strip characters that occur in `sepobj` from one or both ends of `self`,
// per striptype (LEFTSTRIP / RIGHTSTRIP / both). Returns a new substring.
static PyObject *_PyStr_XStrip(PyObject *self, int striptype, PyObject *sepobj) {
    const void *data;
    int kind;
    Py_ssize_t i, j, len;
    BLOOM_MASK sepmask;
    Py_ssize_t seplen;
    // This check is needed from Python 3.9 and earlier.
    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
        return NULL;
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);
    len = PyUnicode_GET_LENGTH(self);
    seplen = PyUnicode_GET_LENGTH(sepobj);
    // The bloom mask cheaply rejects characters definitely not in sepobj;
    // PyUnicode_FindChar confirms membership on mask hits.
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                              PyUnicode_DATA(sepobj),
                              seplen);
    i = 0;
    if (striptype != RIGHTSTRIP) {
        // Advance i past leading separator characters.
        while (i < len) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (!BLOOM(sepmask, ch))
                break;
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
                break;
            i++;
        }
    }
    j = len;
    if (striptype != LEFTSTRIP) {
        // Move j back past trailing separator characters.
        j--;
        while (j >= i) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
            if (!BLOOM(sepmask, ch))
                break;
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
                break;
            j--;
        }
        j++;
    }
    return PyUnicode_Substring(self, i, j);
}
// Copied from do_strip function in cpython.git/Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
// str.strip()/lstrip()/rstrip(). With sep == NULL or None, strips
// whitespace; otherwise strips characters from `sep` via _PyStr_XStrip.
PyObject *_CPyStr_Strip(PyObject *self, int strip_type, PyObject *sep) {
    if (sep == NULL || Py_IsNone(sep)) {
        Py_ssize_t len, i, j;
        // This check is needed from Python 3.9 and earlier.
        if (PyUnicode_READY(self) == -1)
            return NULL;
        len = PyUnicode_GET_LENGTH(self);
        if (PyUnicode_IS_ASCII(self)) {
            // ASCII fast path: whitespace test is a table lookup per byte.
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
            i = 0;
            if (strip_type != RIGHTSTRIP) {
                while (i < len) {
                    Py_UCS1 ch = data[i];
                    if (!_Py_ascii_whitespace[ch])
                        break;
                    i++;
                }
            }
            j = len;
            if (strip_type != LEFTSTRIP) {
                j--;
                while (j >= i) {
                    Py_UCS1 ch = data[j];
                    if (!_Py_ascii_whitespace[ch])
                        break;
                    j--;
                }
                j++;
            }
        }
        else {
            // General Unicode path: read code points via the kind/data pair.
            int kind = PyUnicode_KIND(self);
            const void *data = PyUnicode_DATA(self);
            i = 0;
            if (strip_type != RIGHTSTRIP) {
                while (i < len) {
                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                    if (!Py_UNICODE_ISSPACE(ch))
                        break;
                    i++;
                }
            }
            j = len;
            if (strip_type != LEFTSTRIP) {
                j--;
                while (j >= i) {
                    Py_UCS4 ch = PyUnicode_READ(kind, data, j);
                    if (!Py_UNICODE_ISSPACE(ch))
                        break;
                    j--;
                }
                j++;
            }
        }
        return PyUnicode_Substring(self, i, j);
    }
    return _PyStr_XStrip(self, strip_type, sep);
}
// str.replace(old, new, count) with a tagged count argument.
PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr,
                         PyObject *new_substr, CPyTagged max_replace) {
    Py_ssize_t limit = CPyTagged_AsSsize_t(max_replace);
    if (limit == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PyUnicode_Replace(str, old_substr, new_substr, limit);
}
// str.startswith(prefix) where prefix is a str or a tuple of strs.
// Returns 1 on match, 0 on no match, 2 on TypeError (bad tuple element).
int CPyStr_Startswith(PyObject *self, PyObject *subobj) {
    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
    if (!PyTuple_Check(subobj)) {
        // Single prefix: tail-match at the start (direction -1).
        return PyUnicode_Tailmatch(self, subobj, 0, end, -1);
    }
    Py_ssize_t count = PyTuple_GET_SIZE(subobj);
    for (Py_ssize_t i = 0; i < count; i++) {
        PyObject *candidate = PyTuple_GET_ITEM(subobj, i);
        if (!PyUnicode_Check(candidate)) {
            PyErr_Format(PyExc_TypeError,
                         "tuple for startswith must only contain str, "
                         "not %.100s",
                         Py_TYPE(candidate)->tp_name);
            return 2;
        }
        if (PyUnicode_Tailmatch(self, candidate, 0, end, -1)) {
            return 1;
        }
    }
    return 0;
}
// str.endswith(suffix) where suffix is a str or a tuple of strs.
// Returns 1 on match, 0 on no match, 2 on TypeError (bad tuple element).
int CPyStr_Endswith(PyObject *self, PyObject *subobj) {
    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
    if (!PyTuple_Check(subobj)) {
        // Single suffix: tail-match at the end (direction 1).
        return PyUnicode_Tailmatch(self, subobj, 0, end, 1);
    }
    Py_ssize_t count = PyTuple_GET_SIZE(subobj);
    for (Py_ssize_t i = 0; i < count; i++) {
        PyObject *candidate = PyTuple_GET_ITEM(subobj, i);
        if (!PyUnicode_Check(candidate)) {
            PyErr_Format(PyExc_TypeError,
                         "tuple for endswith must only contain str, "
                         "not %.100s",
                         Py_TYPE(candidate)->tp_name);
            return 2;
        }
        if (PyUnicode_Tailmatch(self, candidate, 0, end, 1)) {
            return 1;
        }
    }
    return 0;
}
// str.removeprefix(prefix): returns self (new reference) if the prefix
// doesn't match, otherwise a substring with the prefix dropped.
PyObject *CPyStr_Removeprefix(PyObject *self, PyObject *prefix) {
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    if (!PyUnicode_Tailmatch(self, prefix, 0, len, -1)) {
        return Py_NewRef(self);
    }
    return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix), len);
}
// str.removesuffix(suffix): returns self (new reference) if the suffix
// doesn't match, otherwise a substring with the suffix dropped.
PyObject *CPyStr_Removesuffix(PyObject *self, PyObject *suffix) {
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    if (!PyUnicode_Tailmatch(self, suffix, 0, len, 1)) {
        return Py_NewRef(self);
    }
    return PyUnicode_Substring(self, 0, len - PyUnicode_GET_LENGTH(suffix));
}
/* This does a dodgy attempt to append in place */
PyObject *CPyStr_Append(PyObject *o1, PyObject *o2) {
PyUnicode_Append(&o1, o2);
return o1;
}
// str slicing s[start:end] with tagged-int bounds. Falls back to the generic
// slice helper for non-str objects or non-short indices.
PyObject *CPyStr_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if (unlikely(!PyUnicode_CheckExact(obj)
                 || !CPyTagged_CheckShort(start)
                 || !CPyTagged_CheckShort(end))) {
        return CPyObject_GetSlice(obj, start, end);
    }
    Py_ssize_t begin = CPyTagged_ShortAsSsize_t(start);
    Py_ssize_t stop = CPyTagged_ShortAsSsize_t(end);
    Py_ssize_t len = PyUnicode_GET_LENGTH(obj);
    // Negative bounds count from the end, clamped at 0.
    if (begin < 0) {
        begin += len;
        if (begin < 0) {
            begin = 0;
        }
    }
    if (stop < 0) {
        stop += len;
        if (stop < 0) {
            stop = 0;
        }
    }
    return PyUnicode_Substring(obj, begin, stop);
}
/* Check if the given string is true (i.e. its length isn't zero) */
/* Check if the given string is true (i.e. its length isn't zero) */
bool CPyStr_IsTrue(PyObject *obj) {
    return PyUnicode_GET_LENGTH(obj) != 0;
}
// len(str). Returns -1 if PyUnicode_READY fails.
// (Despite the name, the return type is Py_ssize_t.)
Py_ssize_t CPyStr_Size_size_t(PyObject *str) {
    if (PyUnicode_READY(str) == -1) {
        return -1;
    }
    return PyUnicode_GET_LENGTH(str);
}
// bytes-like .decode(encoding, errors); encoding/errors may be NULL for
// the defaults. Returns a new str, or NULL with an exception set.
PyObject *CPy_Decode(PyObject *obj, PyObject *encoding, PyObject *errors) {
    const char *enc = NULL;
    const char *err = NULL;
    // Translate optional str arguments into C strings.
    if (encoding != NULL) {
        enc = PyUnicode_AsUTF8AndSize(encoding, NULL);
        if (enc == NULL) {
            return NULL;
        }
    }
    if (errors != NULL) {
        err = PyUnicode_AsUTF8AndSize(errors, NULL);
        if (err == NULL) {
            return NULL;
        }
    }
    if (!PyBytes_Check(obj)) {
        return PyUnicode_FromEncodedObject(obj, enc, err);
    }
    // Fast path for bytes: decode the internal buffer directly.
    return PyUnicode_Decode(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                            enc, err);
}
// Specialized bytes.decode("utf-8") with strict error handling.
PyObject *CPy_DecodeUTF8(PyObject *bytes) {
    if (!PyBytes_CheckExact(bytes)) {
        return PyUnicode_FromEncodedObject(bytes, "utf-8", "strict");
    }
    char *buf = PyBytes_AsString(bytes); // Borrowed reference
    if (buf == NULL) {
        return NULL;
    }
    return PyUnicode_DecodeUTF8(buf, PyBytes_Size(bytes), "strict");
}
// Specialized bytes.decode("ascii") with strict error handling.
PyObject *CPy_DecodeASCII(PyObject *bytes) {
    if (!PyBytes_CheckExact(bytes)) {
        return PyUnicode_FromEncodedObject(bytes, "ascii", "strict");
    }
    char *buf = PyBytes_AsString(bytes); // Borrowed reference
    if (buf == NULL) {
        return NULL;
    }
    return PyUnicode_DecodeASCII(buf, PyBytes_Size(bytes), "strict");
}
// Specialized bytes.decode("latin1") with strict error handling.
PyObject *CPy_DecodeLatin1(PyObject *bytes) {
    if (!PyBytes_CheckExact(bytes)) {
        return PyUnicode_FromEncodedObject(bytes, "latin1", "strict");
    }
    char *buf = PyBytes_AsString(bytes); // Borrowed reference
    if (buf == NULL) {
        return NULL;
    }
    return PyUnicode_DecodeLatin1(buf, PyBytes_Size(bytes), "strict");
}
// str.encode(encoding, errors); encoding/errors may be NULL for the
// defaults. Raises for non-str objects.
PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors) {
    const char *enc = NULL;
    const char *err = NULL;
    if (encoding != NULL) {
        enc = PyUnicode_AsUTF8AndSize(encoding, NULL);
        if (enc == NULL) {
            return NULL;
        }
    }
    if (errors != NULL) {
        err = PyUnicode_AsUTF8AndSize(errors, NULL);
        if (err == NULL) {
            return NULL;
        }
    }
    if (!PyUnicode_Check(obj)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_AsEncodedString(obj, enc, err);
}
// str.count(sub, start): counts occurrences from `start` to the end.
// Returns -1 with an exception set on error.
Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start) {
    Py_ssize_t begin = CPyTagged_AsSsize_t(start);
    if (begin == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }
    return PyUnicode_Count(unicode, substring, begin,
                           PyUnicode_GET_LENGTH(unicode));
}
// str.count(sub, start, end) with explicit tagged bounds.
// Returns -1 with an exception set on error.
Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end) {
    Py_ssize_t begin = CPyTagged_AsSsize_t(start);
    if (begin == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }
    Py_ssize_t stop = CPyTagged_AsSsize_t(end);
    if (stop == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }
    return PyUnicode_Count(unicode, substring, begin, stop);
}
// ord(s) for a one-character str, returned as a tagged int.
// Returns CPY_INT_TAG with TypeError set for any other length.
CPyTagged CPyStr_Ord(PyObject *obj) {
    Py_ssize_t length = PyUnicode_GET_LENGTH(obj);
    if (length != 1) {
        PyErr_Format(
            PyExc_TypeError, "ord() expected a character, but a string of length %zd found", length);
        return CPY_INT_TAG;
    }
    int kind = PyUnicode_KIND(obj);
    return PyUnicode_READ(kind, PyUnicode_DATA(obj), 0) << 1;
}
// str * count with a tagged count argument.
PyObject *CPyStr_Multiply(PyObject *str, CPyTagged count) {
    Py_ssize_t repeats = CPyTagged_AsSsize_t(count);
    if (repeats == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PySequence_Repeat(str, repeats);
}
// str.isspace(): true iff non-empty and every character is whitespace.
bool CPyStr_IsSpace(PyObject *str) {
    Py_ssize_t n = PyUnicode_GET_LENGTH(str);
    // The empty string is not considered whitespace.
    if (n == 0) {
        return false;
    }
    if (PyUnicode_IS_ASCII(str)) {
        // ASCII fast path: whitespace test is a table lookup per byte.
        const Py_UCS1 *chars = PyUnicode_1BYTE_DATA(str);
        for (Py_ssize_t i = 0; i < n; i++) {
            if (!_Py_ascii_whitespace[chars[i]]) {
                return false;
            }
        }
        return true;
    }
    int kind = PyUnicode_KIND(str);
    const void *chars = PyUnicode_DATA(str);
    for (Py_ssize_t i = 0; i < n; i++) {
        if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, chars, i))) {
            return false;
        }
    }
    return true;
}
// str.isalnum(): true iff non-empty and every character is alphanumeric.
bool CPyStr_IsAlnum(PyObject *str) {
    Py_ssize_t n = PyUnicode_GET_LENGTH(str);
    if (n == 0) {
        return false;
    }
    if (PyUnicode_IS_ASCII(str)) {
        // ASCII fast path.
        const Py_UCS1 *chars = PyUnicode_1BYTE_DATA(str);
        for (Py_ssize_t i = 0; i < n; i++) {
            if (!Py_ISALNUM(chars[i])) {
                return false;
            }
        }
        return true;
    }
    int kind = PyUnicode_KIND(str);
    const void *chars = PyUnicode_DATA(str);
    for (Py_ssize_t i = 0; i < n; i++) {
        if (!Py_UNICODE_ISALNUM(PyUnicode_READ(kind, chars, i))) {
            return false;
        }
    }
    return true;
}
// ASCII single-character case mappings (1-to-1 by construction).
static inline int CPy_ASCII_Lower(unsigned char c) { return Py_TOLOWER(c); }
static inline int CPy_ASCII_Upper(unsigned char c) { return Py_TOUPPER(c); }
// Shared implementation of str.lower()/str.upper(). ASCII strings use the
// per-byte ascii_func mapping; non-ASCII handling depends on the Python
// version (see the #if branches below).
static inline PyObject *CPyStr_ChangeCase(PyObject *self,
                                          int (*ascii_func)(unsigned char),
#if CPY_3_13_FEATURES
                                          PyObject *method_name
#else
                                          int (*unicode_func)(Py_UCS4, Py_UCS4 *)
#endif
                                          ) {
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    if (len == 0) {
        // Empty string maps to itself.
        Py_INCREF(self);
        return self;
    }
    // ASCII fast path: 1-to-1, no expansion possible
    if (PyUnicode_IS_ASCII(self)) {
        PyObject *res = PyUnicode_New(len, 127);
        if (res == NULL) return NULL;
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
        Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
        for (Py_ssize_t i = 0; i < len; i++) {
            res_data[i] = ascii_func(data[i]);
        }
        return res;
    }
#if CPY_3_13_FEATURES
    // On 3.13+, _PyUnicode_ToLowerFull/ToUpperFull are no longer exported,
    // so fall back to CPython's method implementation for non-ASCII strings.
    return PyObject_CallMethodNoArgs(self, method_name);
#else
    // General Unicode: unicode_func handles 1-to-N expansion.
    // Worst case: each codepoint expands to 3 (per Unicode standard).
    // The tmp buffer is short-lived, and PyUnicode_FromKindAndData
    // compacts the result to the optimal string kind automatically.
    // NOTE(review): sizeof(Py_UCS4) * len * 3 could in principle overflow
    // size_t for enormous strings on 32-bit builds -- verify whether an
    // explicit overflow check is warranted.
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    Py_UCS4 *tmp = PyMem_Malloc(sizeof(Py_UCS4) * len * 3);
    if (tmp == NULL) return PyErr_NoMemory();
    Py_UCS4 mapped[3];
    Py_ssize_t out_len = 0;
    for (Py_ssize_t i = 0; i < len; i++) {
        int n = unicode_func(PyUnicode_READ(kind, data, i), mapped);
        for (int j = 0; j < n; j++) {
            tmp[out_len++] = mapped[j];
        }
    }
    PyObject *res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, tmp, out_len);
    PyMem_Free(tmp);
    return res;
#endif
}
// str.lower(). The third argument to CPyStr_ChangeCase differs by Python
// version: an interned "lower" method name on 3.13+, the (no longer
// exported there) _PyUnicode_ToLowerFull function on older versions.
PyObject *CPyStr_Lower(PyObject *self) {
#if CPY_3_13_FEATURES
    return CPyStr_ChangeCase(self, CPy_ASCII_Lower, mypyc_interned_str.lower);
#else
    return CPyStr_ChangeCase(self, CPy_ASCII_Lower, _PyUnicode_ToLowerFull);
#endif
}
// str.upper(). Mirrors CPyStr_Lower: interned "upper" name on 3.13+,
// _PyUnicode_ToUpperFull on older versions.
PyObject *CPyStr_Upper(PyObject *self) {
#if CPY_3_13_FEATURES
    return CPyStr_ChangeCase(self, CPy_ASCII_Upper, mypyc_interned_str.upper);
#else
    return CPyStr_ChangeCase(self, CPy_ASCII_Upper, _PyUnicode_ToUpperFull);
#endif
}
// str.isdigit(): true iff non-empty and every character is a digit.
// ASCII strings use the cheap Py_ISDIGIT test; otherwise the full
// Py_UNICODE_ISDIGIT classification runs per storage kind.
bool CPyStr_IsDigit(PyObject *str) {
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) return false;
#define CHECK_ISDIGIT(TYPE, DATA, CHECK) \
    { \
        const TYPE *data = (const TYPE *)(DATA); \
        for (Py_ssize_t i = 0; i < len; i++) { \
            if (!CHECK(data[i])) \
                return false; \
        } \
    }
    // ASCII fast path
    if (PyUnicode_IS_ASCII(str)) {
        CHECK_ISDIGIT(Py_UCS1, PyUnicode_1BYTE_DATA(str), Py_ISDIGIT);
        return true;
    }
    switch (PyUnicode_KIND(str)) {
        case PyUnicode_1BYTE_KIND:
            CHECK_ISDIGIT(Py_UCS1, PyUnicode_1BYTE_DATA(str), Py_UNICODE_ISDIGIT);
            break;
        case PyUnicode_2BYTE_KIND:
            CHECK_ISDIGIT(Py_UCS2, PyUnicode_2BYTE_DATA(str), Py_UNICODE_ISDIGIT);
            break;
        case PyUnicode_4BYTE_KIND:
            CHECK_ISDIGIT(Py_UCS4, PyUnicode_4BYTE_DATA(str), Py_UNICODE_ISDIGIT);
            break;
        default:
            Py_UNREACHABLE();
    }
    return true;
#undef CHECK_ISDIGIT
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,102 @@
#ifndef LIBRT_STRINGS_H
#define LIBRT_STRINGS_H
#ifndef MYPYC_EXPERIMENTAL
// No-op stand-in used when MYPYC_EXPERIMENTAL is not defined.
static int
import_librt_strings(void)
{
    // All librt.strings features are experimental for now, so don't set up
    // the API here. (The original comment said "librt.base64" -- a
    // copy-paste slip from the sibling header.)
    return 0;
}
#else // MYPYC_EXPERIMENTAL
#include <stdbool.h>
#include <Python.h>
#include "librt_strings_common.h"
// ABI version -- only an exact match is compatible. This will only be changed in
// very exceptional cases (likely never) due to strict backward compatibility
// requirements.
#define LIBRT_STRINGS_ABI_VERSION 1
// API version -- more recent versions must maintain backward compatibility, i.e.
// we can add new features but not remove or change existing features (unless
// ABI version is changed, but see the comment above).
#define LIBRT_STRINGS_API_VERSION 4
// Number of functions in the capsule API. If you add a new function, also increase
// LIBRT_STRINGS_API_VERSION.
#define LIBRT_STRINGS_API_LEN 14
static void *LibRTStrings_API[LIBRT_STRINGS_API_LEN];
// Layout of librt.strings' StringWriter instances, duplicated here so this
// header can access fields directly. Presumably must stay in sync with
// librt's own definition -- the ABI version check in import_librt_strings()
// guards against mismatched builds.
typedef struct {
    PyObject_HEAD
    char *buf; // Beginning of the buffer
    char kind; // Bytes per code point (1, 2 or 4)
    Py_ssize_t len; // Current length (number of code points written)
    Py_ssize_t capacity; // Total capacity of the buffer (number of code points)
    char data[WRITER_EMBEDDED_BUF_LEN]; // Default buffer
} StringWriterObject;
#define LibRTStrings_ABIVersion (*(int (*)(void)) LibRTStrings_API[0])
#define LibRTStrings_APIVersion (*(int (*)(void)) LibRTStrings_API[1])
#define LibRTStrings_BytesWriter_internal (*(PyObject* (*)(void)) LibRTStrings_API[2])
#define LibRTStrings_BytesWriter_getvalue_internal (*(PyObject* (*)(PyObject *source)) LibRTStrings_API[3])
#define LibRTStrings_BytesWriter_append_internal (*(char (*)(PyObject *source, uint8_t value)) LibRTStrings_API[4])
#define LibRTStrings_ByteWriter_grow_buffer_internal (*(bool (*)(BytesWriterObject *obj, Py_ssize_t size)) LibRTStrings_API[5])
#define LibRTStrings_BytesWriter_type_internal (*(PyTypeObject* (*)(void)) LibRTStrings_API[6])
#define LibRTStrings_BytesWriter_truncate_internal (*(char (*)(PyObject *self, int64_t size)) LibRTStrings_API[7])
#define LibRTStrings_StringWriter_internal (*(PyObject* (*)(void)) LibRTStrings_API[8])
#define LibRTStrings_StringWriter_getvalue_internal (*(PyObject* (*)(PyObject *source)) LibRTStrings_API[9])
#define LibRTStrings_string_append_slow_path (*(char (*)(StringWriterObject *obj, int32_t value)) LibRTStrings_API[10])
#define LibRTStrings_StringWriter_type_internal (*(PyTypeObject* (*)(void)) LibRTStrings_API[11])
#define LibRTStrings_StringWriter_write_internal (*(char (*)(PyObject *source, PyObject *value)) LibRTStrings_API[12])
#define LibRTStrings_grow_string_buffer (*(bool (*)(StringWriterObject *obj, Py_ssize_t n)) LibRTStrings_API[13])
// Import librt.strings, copy its capsule function table into
// LibRTStrings_API, and verify version compatibility.
// Returns 0 on success, -1 with a Python exception set on failure.
static int
import_librt_strings(void)
{
    PyObject *mod = PyImport_ImportModule("librt.strings");
    if (mod == NULL)
        return -1;
    Py_DECREF(mod); // we import just for the side effect of making the below work.
    void *capsule = PyCapsule_Import("librt.strings._C_API", 0);
    if (capsule == NULL)
        return -1;
    memcpy(LibRTStrings_API, capsule, sizeof(LibRTStrings_API));
    // ABI must match exactly; API merely has to be new enough. PyErr_Format
    // builds the message directly, avoiding the fixed-size stack buffer +
    // snprintf dance (no truncation concerns, same message text).
    if (LibRTStrings_ABIVersion() != LIBRT_STRINGS_ABI_VERSION) {
        PyErr_Format(PyExc_ValueError,
                     "ABI version conflict for librt.strings, expected %d, found %d",
                     LIBRT_STRINGS_ABI_VERSION, LibRTStrings_ABIVersion());
        return -1;
    }
    if (LibRTStrings_APIVersion() < LIBRT_STRINGS_API_VERSION) {
        PyErr_Format(PyExc_ValueError,
                     "API version conflict for librt.strings, expected %d or newer, found %d (hint: upgrade librt)",
                     LIBRT_STRINGS_API_VERSION, LibRTStrings_APIVersion());
        return -1;
    }
    return 0;
}
// Exact-type check: is obj a BytesWriter instance?
static inline bool CPyBytesWriter_Check(PyObject *obj) {
    PyTypeObject *writer_type = LibRTStrings_BytesWriter_type_internal();
    return Py_TYPE(obj) == writer_type;
}
// Exact-type check: is obj a StringWriter instance?
static inline bool CPyStringWriter_Check(PyObject *obj) {
    PyTypeObject *writer_type = LibRTStrings_StringWriter_type_internal();
    return Py_TYPE(obj) == writer_type;
}
#endif // MYPYC_EXPERIMENTAL
#endif // LIBRT_STRINGS_H

View file

@ -0,0 +1,352 @@
#ifndef LIBRT_STRINGS_COMMON_H
#define LIBRT_STRINGS_COMMON_H
#include <Python.h>
#include <stdint.h>
#include <string.h>
// Byte-swap helpers used to convert between host order and a fixed
// serialization order (needed for both LE and BE operations). Compiler
// intrinsics where available; portable shift/mask fallbacks otherwise.
#if defined(_MSC_VER)
# include <stdlib.h>
# define BSWAP16(x) _byteswap_ushort(x)
# define BSWAP32(x) _byteswap_ulong(x)
# define BSWAP64(x) _byteswap_uint64(x)
#elif defined(__GNUC__) || defined(__clang__)
# define BSWAP16(x) __builtin_bswap16(x)
# define BSWAP32(x) __builtin_bswap32(x)
# define BSWAP64(x) __builtin_bswap64(x)
#else
// Portable fallbacks for other compilers (slower, but correct everywhere).
static inline uint16_t BSWAP16(uint16_t x) {
    return (uint16_t)((x << 8) | (x >> 8));
}
static inline uint32_t BSWAP32(uint32_t x) {
    // Swap bytes within 16-bit halves, then swap the halves.
    x = ((x & 0x00FF00FFU) << 8) | ((x & 0xFF00FF00U) >> 8);
    return (x << 16) | (x >> 16);
}
static inline uint64_t BSWAP64(uint64_t x) {
    // Swap bytes within 16-bit lanes, then 16-bit lanes within 32-bit
    // halves, then the two halves.
    x = ((x & 0x00FF00FF00FF00FFULL) << 8)  | ((x & 0xFF00FF00FF00FF00ULL) >> 8);
    x = ((x & 0x0000FFFF0000FFFFULL) << 16) | ((x & 0xFFFF0000FFFF0000ULL) >> 16);
    return (x << 32) | (x >> 32);
}
#endif
// Length of the default buffer embedded directly in a BytesWriter object
#define WRITER_EMBEDDED_BUF_LEN 256
// Growable byte buffer written through the BytesWriter_Write*Unsafe helpers
// below. Small outputs can live in the embedded 'data' array; 'buf' points
// at the active storage (growth/allocation code is not visible in this
// header -- see the BytesWriter grow routine in the librt.strings module).
typedef struct {
    PyObject_HEAD
    char *buf; // Beginning of the buffer
    Py_ssize_t len; // Current length (number of bytes written)
    Py_ssize_t capacity; // Total capacity of the buffer
    char data[WRITER_EMBEDDED_BUF_LEN]; // Default buffer
} BytesWriterObject;
// Append a 16-bit signed integer as two little-endian bytes.
// Capacity is NOT checked: the caller must guarantee 2 free bytes.
static inline void
BytesWriter_WriteI16LEUnsafe(BytesWriterObject *self, int16_t value) {
    uint16_t raw = (uint16_t)value;
#if PY_BIG_ENDIAN
    raw = BSWAP16(raw);  // host order -> little-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 2);
    self->len += 2;
}
// Append a 16-bit signed integer as two big-endian bytes.
// Capacity is NOT checked: the caller must guarantee 2 free bytes.
static inline void
BytesWriter_WriteI16BEUnsafe(BytesWriterObject *self, int16_t value) {
    uint16_t raw = (uint16_t)value;
#if !PY_BIG_ENDIAN
    raw = BSWAP16(raw);  // host order -> big-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 2);
    self->len += 2;
}
// Read a 16-bit signed integer stored as two little-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline int16_t
CPyBytes_ReadI16LEUnsafe(const unsigned char *data) {
    uint16_t raw;
    memcpy(&raw, data, sizeof raw);  // single load under optimization
#if PY_BIG_ENDIAN
    return (int16_t)BSWAP16(raw);
#else
    return (int16_t)raw;
#endif
}
// Read a 16-bit signed integer stored as two big-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline int16_t
CPyBytes_ReadI16BEUnsafe(const unsigned char *data) {
    uint16_t raw;
    memcpy(&raw, data, sizeof raw);  // single load under optimization
#if !PY_BIG_ENDIAN
    raw = BSWAP16(raw);  // little-endian host: reverse the bytes
#endif
    return (int16_t)raw;
}
// Append a 32-bit signed integer as four little-endian bytes.
// Capacity is NOT checked: the caller must guarantee 4 free bytes.
static inline void
BytesWriter_WriteI32LEUnsafe(BytesWriterObject *self, int32_t value) {
    uint32_t raw = (uint32_t)value;
#if PY_BIG_ENDIAN
    raw = BSWAP32(raw);  // host order -> little-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 4);
    self->len += 4;
}
// Append a 32-bit signed integer as four big-endian bytes.
// Capacity is NOT checked: the caller must guarantee 4 free bytes.
static inline void
BytesWriter_WriteI32BEUnsafe(BytesWriterObject *self, int32_t value) {
    uint32_t raw = (uint32_t)value;
#if !PY_BIG_ENDIAN
    raw = BSWAP32(raw);  // host order -> big-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 4);
    self->len += 4;
}
// Read a 32-bit signed integer stored as four little-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline int32_t
CPyBytes_ReadI32LEUnsafe(const unsigned char *data) {
    uint32_t raw;
    memcpy(&raw, data, sizeof raw);  // single load under optimization
#if PY_BIG_ENDIAN
    return (int32_t)BSWAP32(raw);
#else
    return (int32_t)raw;
#endif
}
// Read a 32-bit signed integer stored as four big-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline int32_t
CPyBytes_ReadI32BEUnsafe(const unsigned char *data) {
    uint32_t raw;
    memcpy(&raw, data, sizeof raw);  // single load under optimization
#if !PY_BIG_ENDIAN
    raw = BSWAP32(raw);  // little-endian host: reverse the bytes
#endif
    return (int32_t)raw;
}
// Append a 64-bit signed integer as eight little-endian bytes.
// Capacity is NOT checked: the caller must guarantee 8 free bytes.
static inline void
BytesWriter_WriteI64LEUnsafe(BytesWriterObject *self, int64_t value) {
    uint64_t raw = (uint64_t)value;
#if PY_BIG_ENDIAN
    raw = BSWAP64(raw);  // host order -> little-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 8);
    self->len += 8;
}
// Append a 64-bit signed integer as eight big-endian bytes.
// Capacity is NOT checked: the caller must guarantee 8 free bytes.
static inline void
BytesWriter_WriteI64BEUnsafe(BytesWriterObject *self, int64_t value) {
    uint64_t raw = (uint64_t)value;
#if !PY_BIG_ENDIAN
    raw = BSWAP64(raw);  // host order -> big-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 8);
    self->len += 8;
}
// Read a 64-bit signed integer stored as eight little-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline int64_t
CPyBytes_ReadI64LEUnsafe(const unsigned char *data) {
    uint64_t raw;
    memcpy(&raw, data, sizeof raw);  // single load under optimization
#if PY_BIG_ENDIAN
    return (int64_t)BSWAP64(raw);
#else
    return (int64_t)raw;
#endif
}
// Read a 64-bit signed integer stored as eight big-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline int64_t
CPyBytes_ReadI64BEUnsafe(const unsigned char *data) {
    uint64_t raw;
    memcpy(&raw, data, sizeof raw);  // single load under optimization
#if !PY_BIG_ENDIAN
    raw = BSWAP64(raw);  // little-endian host: reverse the bytes
#endif
    return (int64_t)raw;
}
// Append a 32-bit float as four little-endian bytes (IEEE-754 bit pattern).
// Capacity is NOT checked: the caller must guarantee 4 free bytes.
static inline void
BytesWriter_WriteF32LEUnsafe(BytesWriterObject *self, float value) {
    uint32_t raw;
    memcpy(&raw, &value, 4);  // type-pun via memcpy (no aliasing UB)
#if PY_BIG_ENDIAN
    raw = BSWAP32(raw);  // host order -> little-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 4);
    self->len += 4;
}
// Append a 32-bit float as four big-endian bytes (IEEE-754 bit pattern).
// Capacity is NOT checked: the caller must guarantee 4 free bytes.
static inline void
BytesWriter_WriteF32BEUnsafe(BytesWriterObject *self, float value) {
    uint32_t raw;
    memcpy(&raw, &value, 4);  // type-pun via memcpy (no aliasing UB)
#if !PY_BIG_ENDIAN
    raw = BSWAP32(raw);  // host order -> big-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 4);
    self->len += 4;
}
// Read a 32-bit IEEE-754 float stored as four little-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline float
CPyBytes_ReadF32LEUnsafe(const unsigned char *data) {
    uint32_t raw;
    memcpy(&raw, data, 4);  // single load under optimization
#if PY_BIG_ENDIAN
    raw = BSWAP32(raw);
#endif
    float value;
    memcpy(&value, &raw, 4);  // type-pun via memcpy (no aliasing UB)
    return value;
}
// Read a 32-bit IEEE-754 float stored as four big-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline float
CPyBytes_ReadF32BEUnsafe(const unsigned char *data) {
    uint32_t raw;
    memcpy(&raw, data, 4);  // single load under optimization
#if !PY_BIG_ENDIAN
    raw = BSWAP32(raw);  // little-endian host: reverse the bytes
#endif
    float value;
    memcpy(&value, &raw, 4);  // type-pun via memcpy (no aliasing UB)
    return value;
}
// Append a 64-bit float (double) as eight little-endian bytes.
// Capacity is NOT checked: the caller must guarantee 8 free bytes.
static inline void
BytesWriter_WriteF64LEUnsafe(BytesWriterObject *self, double value) {
    uint64_t raw;
    memcpy(&raw, &value, 8);  // type-pun via memcpy (no aliasing UB)
#if PY_BIG_ENDIAN
    raw = BSWAP64(raw);  // host order -> little-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 8);
    self->len += 8;
}
// Append a 64-bit float (double) as eight big-endian bytes.
// Capacity is NOT checked: the caller must guarantee 8 free bytes.
static inline void
BytesWriter_WriteF64BEUnsafe(BytesWriterObject *self, double value) {
    uint64_t raw;
    memcpy(&raw, &value, 8);  // type-pun via memcpy (no aliasing UB)
#if !PY_BIG_ENDIAN
    raw = BSWAP64(raw);  // host order -> big-endian
#endif
    // A fixed-size memcpy lowers to a single store on GCC/Clang/MSVC.
    memcpy(self->buf + self->len, &raw, 8);
    self->len += 8;
}
// Read a 64-bit IEEE-754 double stored as eight little-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline double
CPyBytes_ReadF64LEUnsafe(const unsigned char *data) {
    uint64_t raw;
    memcpy(&raw, data, 8);  // single load under optimization
#if PY_BIG_ENDIAN
    raw = BSWAP64(raw);
#endif
    double value;
    memcpy(&value, &raw, 8);  // type-pun via memcpy (no aliasing UB)
    return value;
}
// Read a 64-bit IEEE-754 double stored as eight big-endian bytes.
// No bounds checking: the caller must guarantee the bytes are readable.
static inline double
CPyBytes_ReadF64BEUnsafe(const unsigned char *data) {
    uint64_t raw;
    memcpy(&raw, data, 8);  // single load under optimization
#if !PY_BIG_ENDIAN
    raw = BSWAP64(raw);  // little-endian host: reverse the bytes
#endif
    double value;
    memcpy(&value, &raw, 8);  // type-pun via memcpy (no aliasing UB)
    return value;
}
#endif // LIBRT_STRINGS_COMMON_H

View file

@ -0,0 +1,11 @@
// Primitives related to librt.strings.StringWriter that get linked statically
// with compiled modules, instead of being called via a capsule.
#include "stringwriter_extra_ops.h"
#ifdef MYPYC_EXPERIMENTAL
// All StringWriter operations are currently implemented as inline functions
// in stringwriter_extra_ops.h, or use the exported capsule API directly.
#endif // MYPYC_EXPERIMENTAL

View file

@ -0,0 +1,79 @@
#ifndef STRINGWRITER_EXTRA_OPS_H
#define STRINGWRITER_EXTRA_OPS_H
#ifdef MYPYC_EXPERIMENTAL
#include <stdbool.h>
#include <stdint.h>
#include <Python.h>
#include "mypyc_util.h"
#include "strings/librt_strings.h"
// Current length of the writer as a CPyTagged short int (value shifted
// left by one, low tag bit clear).
static inline CPyTagged
CPyStringWriter_Len(PyObject *obj) {
    Py_ssize_t n = ((StringWriterObject *)obj)->len;
    return (CPyTagged)n << 1;
}
// Ensure room for n more code points, growing the buffer when needed.
// Returns false (with a Python exception set by the grow routine) on failure.
static inline bool
CPyStringWriter_EnsureSize(StringWriterObject *data, Py_ssize_t n) {
    if (likely(n <= data->capacity - data->len)) {
        return true;
    }
    return LibRTStrings_grow_string_buffer(data, n);
}
// Append a single code point to the writer.
// Returns CPY_NONE on success, CPY_NONE_ERROR on failure.
static inline char
CPyStringWriter_Append(PyObject *obj, int32_t value) {
    StringWriterObject *self = (StringWriterObject *)obj;
    // Fast path: Latin-1 storage (kind 1) and a code point that fits in one byte.
    if (self->kind == 1 && (uint32_t)value < 256) {
        // Keep the length in a local so the compiler can optimize better.
        Py_ssize_t pos = self->len;
        if (!CPyStringWriter_EnsureSize(self, 1))
            return CPY_NONE_ERROR;
        self->buf[pos] = (char)value;
        self->len = pos + 1;
        return CPY_NONE;
    }
    // Slow path handles kind widening and all other cases.
    return LibRTStrings_string_append_slow_path(self, value);
}
// If index is negative, convert it to a non-negative index by adding the
// writer's length (no range checking is done here).
static inline int64_t CPyStringWriter_AdjustIndex(PyObject *obj, int64_t index) {
    return index >= 0 ? index : index + ((StringWriterObject *)obj)->len;
}
// True iff index (already adjusted) falls within the written region.
static inline bool CPyStringWriter_RangeCheck(PyObject *obj, int64_t index) {
    Py_ssize_t n = ((StringWriterObject *)obj)->len;
    return 0 <= index && index < n;
}
// Fetch the code point at index; the caller must have validated the index.
static inline int32_t CPyStringWriter_GetItem(PyObject *obj, int64_t index) {
    StringWriterObject *self = (StringWriterObject *)obj;
    const char *buf = self->buf;
    switch (self->kind) {
        case 1:
            return (uint8_t)buf[index];
        case 2: {
            uint16_t code;
            memcpy(&code, buf + index * 2, 2);
            return (int32_t)code;
        }
        default: {  // kind 4
            uint32_t code;
            memcpy(&code, buf + index * 4, 4);
            return (int32_t)code;
        }
    }
}
#endif // MYPYC_EXPERIMENTAL
#endif

View file

@ -0,0 +1,140 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <time.h>
#include <stdint.h>
#include "librt_time.h"
#include "pythoncapi_compat.h"
#include "mypyc_util.h"
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif
#ifdef MYPYC_EXPERIMENTAL
// Internal function that returns a C double for mypyc primitives
// Returns high-precision time in seconds (like time.time())
//
// Error contract: on failure returns CPY_FLOAT_ERROR with a Python exception
// set. The only failing path here is the Unix gettimeofday fallback; the
// Windows and clock_gettime paths never fail.
static double
time_time_internal(void) {
#ifdef _WIN32
    // Windows: Use GetSystemTimePreciseAsFileTime for ~100ns precision
    FILETIME ft;
    ULARGE_INTEGER large;
    GetSystemTimePreciseAsFileTime(&ft);
    large.LowPart = ft.dwLowDateTime;
    large.HighPart = ft.dwHighDateTime;
    // Windows FILETIME is 100-nanosecond intervals since January 1, 1601
    // 116444736000000000 = number of 100-ns intervals between 1601 and 1970
    // Convert directly to seconds: 100ns * 1e-9 = 1e-7
    int64_t intervals = large.QuadPart - 116444736000000000LL;
    return (double)intervals * 1e-7;
#else // Unix-like systems (Linux, macOS, BSD, etc.)
    // Try clock_gettime(CLOCK_REALTIME) for nanosecond precision
    // This is available on POSIX.1-2001 and later (widely available on modern systems)
#if defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0
    struct timespec ts;
    if (clock_gettime(CLOCK_REALTIME, &ts) == 0) {
        // Convert seconds and nanoseconds separately to avoid large integer operations
        return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;
    }
    // Fall through to gettimeofday if clock_gettime failed
#endif
    // Fallback: gettimeofday for microsecond precision
    // This is widely available (POSIX.1-2001, BSD, etc.)
    struct timeval tv;
    if (unlikely(gettimeofday(&tv, NULL) != 0)) {
        PyErr_SetFromErrno(PyExc_OSError);
        return CPY_FLOAT_ERROR;
    }
    // Convert seconds and microseconds separately to avoid large integer operations
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
#endif
}
// Python-level wrapper for normal extension usage: time() -> float.
// METH_FASTCALL entry point; rejects any arguments.
static PyObject*
time_time(PyObject *self, PyObject *const *args, size_t nargs) {
    if (nargs > 0) {
        PyErr_SetString(PyExc_TypeError, "time() takes no arguments");
        return NULL;
    }
    double now = time_time_internal();
    if (now == CPY_FLOAT_ERROR) {
        // time_time_internal() has already set a Python exception.
        return NULL;
    }
    return PyFloat_FromDouble(now);
}
#endif
// Module method table. The time() entry is compiled only in experimental builds.
static PyMethodDef librt_time_module_methods[] = {
#ifdef MYPYC_EXPERIMENTAL
    {"time", (PyCFunction)time_time, METH_FASTCALL,
     PyDoc_STR("Return the current time in seconds since the Unix epoch as a floating point number.")},
#endif
    {NULL, NULL, 0, NULL}
};
#ifdef MYPYC_EXPERIMENTAL
// Capsule slot 0: ABI version this module was built with (exact match required).
static int
time_abi_version(void) {
    return LIBRT_TIME_ABI_VERSION;
}
// Capsule slot 1: API version this module was built with (newer is compatible).
static int
time_api_version(void) {
    return LIBRT_TIME_API_VERSION;
}
#endif
// Py_mod_exec slot: in experimental builds, publish the C API function table
// as the "librt.time._C_API" capsule so compiled modules can bind it via
// import_librt_time(). Returns 0 on success, -1 on error.
static int
librt_time_module_exec(PyObject *m)
{
#ifdef MYPYC_EXPERIMENTAL
    // Export mypyc internal C API via capsule
    static void *time_api[LIBRT_TIME_API_LEN] = {
        (void *)time_abi_version,
        (void *)time_api_version,
        (void *)time_time_internal,
    };
    PyObject *c_api_object = PyCapsule_New((void *)time_api, "librt.time._C_API", NULL);
    // PyModule_Add steals the reference; it reports failure (returns -1)
    // if c_api_object is NULL, so no separate NULL check is needed here.
    if (PyModule_Add(m, "_C_API", c_api_object) < 0) {
        return -1;
    }
#endif
    return 0;
}
// Module slots: multi-phase initialization; declare free-threaded (no-GIL)
// support where the target CPython defines Py_MOD_GIL_NOT_USED.
static PyModuleDef_Slot librt_time_module_slots[] = {
    {Py_mod_exec, librt_time_module_exec},
#ifdef Py_MOD_GIL_NOT_USED
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
#endif
    {0, NULL}
};
static PyModuleDef librt_time_module = {
    .m_base = PyModuleDef_HEAD_INIT,
    .m_name = "time",
    .m_doc = "Fast time() function optimized for mypyc",
    .m_size = 0,
    .m_methods = librt_time_module_methods,
    .m_slots = librt_time_module_slots,
};
// Multi-phase initialization entry point for the librt.time submodule.
PyMODINIT_FUNC
PyInit_time(void)
{
    return PyModuleDef_Init(&librt_time_module);
}

View file

@ -0,0 +1,62 @@
#ifndef LIBRT_TIME_H
#define LIBRT_TIME_H
#ifndef MYPYC_EXPERIMENTAL
// Non-experimental builds: librt.time is entirely experimental, so there is
// no capsule API to bind; importing is a no-op that always succeeds.
static int
import_librt_time(void)
{
    // All librt.time features are experimental for now, so don't set up the API here
    return 0;
}
#else // MYPYC_EXPERIMENTAL
#include <Python.h>
// ABI must match exactly; API only needs to be at least this new
// (enforced by import_librt_time() below).
#define LIBRT_TIME_ABI_VERSION 1
#define LIBRT_TIME_API_VERSION 1
// Number of entries in the capsule function table.
#define LIBRT_TIME_API_LEN 3
static void *LibRTTime_API[LIBRT_TIME_API_LEN];
// Typed accessors into the capsule table filled by import_librt_time().
#define LibRTTime_ABIVersion (*(int (*)(void)) LibRTTime_API[0])
#define LibRTTime_APIVersion (*(int (*)(void)) LibRTTime_API[1])
#define LibRTTime_time (*(double (*)(void)) LibRTTime_API[2])
// Import librt.time, copy its capsule function table into LibRTTime_API,
// and verify version compatibility.
// Returns 0 on success, -1 with a Python exception set on failure.
static int
import_librt_time(void)
{
    PyObject *mod = PyImport_ImportModule("librt.time");
    if (mod == NULL)
        return -1;
    Py_DECREF(mod); // we import just for the side effect of making the below work.
    void *capsule = PyCapsule_Import("librt.time._C_API", 0);
    if (capsule == NULL)
        return -1;
    memcpy(LibRTTime_API, capsule, sizeof(LibRTTime_API));
    // ABI must match exactly; API merely has to be new enough. PyErr_Format
    // builds the message directly, avoiding the fixed-size stack buffer +
    // snprintf dance (no truncation concerns, same message text).
    if (LibRTTime_ABIVersion() != LIBRT_TIME_ABI_VERSION) {
        PyErr_Format(PyExc_ValueError,
                     "ABI version conflict for librt.time, expected %d, found %d",
                     LIBRT_TIME_ABI_VERSION, LibRTTime_ABIVersion());
        return -1;
    }
    if (LibRTTime_APIVersion() < LIBRT_TIME_API_VERSION) {
        PyErr_Format(PyExc_ValueError,
                     "API version conflict for librt.time, expected %d or newer, found %d (hint: upgrade librt)",
                     LIBRT_TIME_API_VERSION, LibRTTime_APIVersion());
        return -1;
    }
    return 0;
}
#endif // MYPYC_EXPERIMENTAL
#endif // LIBRT_TIME_H

View file

@ -0,0 +1,62 @@
// Tuple primitive operations
//
// These are registered in mypyc.primitives.tuple_ops.
#include <Python.h>
#include "CPy.h"
// tuple[index] with negative-index support and bounds checking.
// Returns a new reference, or NULL with IndexError/OverflowError set.
PyObject *CPySequenceTuple_GetItem(PyObject *tuple, CPyTagged index) {
    // Boxed (arbitrary-precision) indexes cannot address a tuple.
    if (!CPyTagged_CheckShort(index)) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    Py_ssize_t i = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t size = PyTuple_GET_SIZE(tuple);
    if (i < 0) {
        i += size;  // Negative indexing counts from the end.
    }
    if (i < 0 || i >= size) {
        PyErr_SetString(PyExc_IndexError, "tuple index out of range");
        return NULL;
    }
    PyObject *item = PyTuple_GET_ITEM(tuple, i);
    Py_INCREF(item);
    return item;
}
// tuple[start:end]. Fast path for exact tuples with short-int bounds;
// everything else goes through the generic slice helper.
PyObject *CPySequenceTuple_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if (likely(PyTuple_CheckExact(obj)
               && CPyTagged_CheckShort(start) && CPyTagged_CheckShort(end))) {
        Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
        Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);
        Py_ssize_t size = PyTuple_GET_SIZE(obj);
        if (lo < 0) {
            lo += size;
        }
        if (hi < 0) {
            hi += size;
        }
        // PyTuple_GetSlice clamps out-of-range bounds itself.
        return PyTuple_GetSlice(obj, lo, hi);
    }
    return CPyObject_GetSlice(obj, start, end);
}
// Fetch tuple[index] as a new reference. No error checking: the caller
// must guarantee index is in range.
PyObject *CPySequenceTuple_GetItemUnsafe(PyObject *tuple, Py_ssize_t index)
{
    PyObject *item = PyTuple_GET_ITEM(tuple, index);
    Py_INCREF(item);
    return item;
}
// PyTuple_SET_ITEM does no error checking,
// and should only be used to fill in brand new tuples.
// Steals the caller's reference to 'value'.
void CPySequenceTuple_SetItemUnsafe(PyObject *tuple, Py_ssize_t index, PyObject *value)
{
    PyTuple_SET_ITEM(tuple, index, value);
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,873 @@
#ifndef VEC_H_INCL
#define VEC_H_INCL
// Header for the implementation of librt.vecs, which defines the 'vec' type.
// Refer to librt_vecs.c for more detailed information.
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdint.h>
#ifndef MYPYC_EXPERIMENTAL
// Non-experimental builds: librt.vecs is entirely experimental, so there is
// no capsule API to bind; importing is a no-op that always succeeds.
static int
import_librt_vecs(void)
{
    // All librt.vecs features are experimental for now, so don't set up the API here
    return 0;
}
#else // MYPYC_EXPERIMENTAL
// Magic (native) integer return value on exception. Caller must also
// use PyErr_Occurred() since this overlaps with valid integer values.
#define MYPYC_INT_ERROR -113
// Item type constants for supported packed/specialized item types; must be
// even but not a multiple of 4 (2 + 4 * n). Each of these has a corresponding
// distinct implementation C extension class. For example, vec[i64] has a
// different runtime type than vec[i32]. All other item types use generic
// implementations.
// The "2 + 4*n" shape keeps bit 1 set, which is how Vec_IsMagicItemType()
// below distinguishes these constants from tagged type-object pointers.
#define VEC_ITEM_TYPE_I64 2
#define VEC_ITEM_TYPE_I32 6
#define VEC_ITEM_TYPE_I16 10
#define VEC_ITEM_TYPE_U8 14
#define VEC_ITEM_TYPE_FLOAT 18
#define VEC_ITEM_TYPE_BOOL 22
// Non-zero iff item_type is one of the VEC_ITEM_TYPE_* magic constants
// (they are all of the form 2 + 4*n, so bit 1 is set).
static inline size_t Vec_IsMagicItemType(size_t item_type) {
    return item_type & (size_t)2;
}
// Buffer objects
//
// Variable-sized Python objects holding vec storage. The trailing 'items[1]'
// member is the pre-C99 trailing-array idiom: the object is presumably
// over-allocated so items can hold ob_size elements (allocation sites are
// not visible in this header -- confirm there).
// vecbuf[i64]
typedef struct _VecI64BufObject {
    PyObject_VAR_HEAD
    int64_t items[1];
} VecI64BufObject;
// vecbuf[i32]
typedef struct _VecI32BufObject {
    PyObject_VAR_HEAD
    int32_t items[1];
} VecI32BufObject;
// vecbuf[i16]
typedef struct _VecI16BufObject {
    PyObject_VAR_HEAD
    int16_t items[1];
} VecI16BufObject;
// vecbuf[u8]
typedef struct _VecU8BufObject {
    PyObject_VAR_HEAD
    uint8_t items[1];
} VecU8BufObject;
// vecbuf[float]
typedef struct _VecFloatBufObject {
    PyObject_VAR_HEAD
    double items[1];
} VecFloatBufObject;
// vecbuf[bool]
typedef struct _VecBoolBufObject {
    PyObject_VAR_HEAD
    char items[1];
} VecBoolBufObject;
// Simple generic vecbuf: vecbuf[t] when t is a type object
typedef struct _VecTBufObject {
    PyObject_VAR_HEAD
    // Tagged pointer to PyTypeObject *. The lowest bit is 1 for optional item type.
    size_t item_type;
    PyObject *items[1];
} VecTBufObject;
// (len, buf) pair describing one nested vec stored inside a nested buffer.
typedef struct _VecNestedBufItem {
    Py_ssize_t len;
    PyObject *buf;
} VecNestedBufItem;
// Nested vec type: vec[vec[...]], vec[vec[...] | None], etc.
typedef struct _VecNestedBufObject {
    PyObject_VAR_HEAD
    // Tagged pointer to PyTypeObject *. Lowest bit is set for optional item type.
    // The second lowest bit is set for a packed item type (VEC_ITEM_TYPE_*).
    size_t item_type;
    // Number of nested vec types (of any kind, at least 1)
    size_t depth;
    VecNestedBufItem items[1];
} VecNestedBufObject;
// Unboxed vec objects
//
// Passed and returned by value as (len, buf) pairs; 'len' may be shorter
// than the buffer's allocated capacity. NOTE(review): len == -1 appears to
// serve as the undefined/error marker (see the tuple_undefined_*
// initializers below) -- confirm against the implementation.
typedef struct _VecI64 {
    Py_ssize_t len;
    VecI64BufObject *buf;
} VecI64;
typedef struct _VecI32 {
    Py_ssize_t len;
    VecI32BufObject *buf;
} VecI32;
typedef struct _VecI16 {
    Py_ssize_t len;
    VecI16BufObject *buf;
} VecI16;
typedef struct _VecU8 {
    Py_ssize_t len;
    VecU8BufObject *buf;
} VecU8;
typedef struct _VecFloat {
    Py_ssize_t len;
    VecFloatBufObject *buf;
} VecFloat;
typedef struct _VecBool {
    Py_ssize_t len;
    VecBoolBufObject *buf;
} VecBool;
typedef struct _VecT {
    Py_ssize_t len;
    VecTBufObject *buf;
} VecT;
typedef struct _VecNested {
    Py_ssize_t len;
    VecNestedBufObject *buf;
} VecNested;
// Boxed vec objects
// Arbitrary boxed vec object (only shared bits)
typedef struct _VecObject {
    PyObject_HEAD
    Py_ssize_t len;
} VecObject;
// Base vec type object (for isinstance checks)
// This is an abstract base type that all specialized vec types inherit from.
// It cannot be instantiated directly - only used for isinstance(x, vec).
typedef struct _VecBaseObject {
    PyObject_HEAD
} VecBaseObject;
// Specialized boxed wrappers: a PyObject header paired with the
// corresponding unboxed vec value.
// Boxed vec[i64]
typedef struct _VecI64Object {
    PyObject_HEAD
    VecI64 vec;
} VecI64Object;
// Boxed vec[i32]
typedef struct _VecI32Object {
    PyObject_HEAD
    VecI32 vec;
} VecI32Object;
// Boxed vec[i16]
typedef struct _VecI16Object {
    PyObject_HEAD
    VecI16 vec;
} VecI16Object;
// Boxed vec[u8]
typedef struct _VecU8Object {
    PyObject_HEAD
    VecU8 vec;
} VecU8Object;
// Boxed vec[float]
typedef struct _VecFloatObject {
    PyObject_HEAD
    VecFloat vec;
} VecFloatObject;
// Boxed vec[bool]
typedef struct _VecBoolObject {
    PyObject_HEAD
    VecBool vec;
} VecBoolObject;
// Simple boxed generic vecbuf: vecbuf[t] when t is a type object
typedef struct _VecTObject {
    PyObject_HEAD
    VecT vec;
} VecTObject;
// Extended generic vec type: vec[t | None], vec[vec[...]], etc.
typedef struct _VecNestedObject {
    PyObject_HEAD
    VecNested vec;
} VecNestedObject;
// (vec, popped item) pair structs used as two-value returns from pop()-style
// operations. The MYPYC_DECLARED_* guards avoid clashing with identical
// declarations that mypyc-compiled modules may emit themselves.
// Each tuple_undefined_* value (len == -1) is the error/undefined result.
#ifndef MYPYC_DECLARED_tuple_T2V88
#define MYPYC_DECLARED_tuple_T2V88
typedef struct tuple_T2V88 {
    VecI64 f0;
    int64_t f1;
} tuple_T2V88;
static tuple_T2V88 tuple_undefined_T2V88 = { { -1, NULL } , 0 };
#endif
#ifndef MYPYC_DECLARED_tuple_T2V44
#define MYPYC_DECLARED_tuple_T2V44
typedef struct tuple_T2V44 {
    VecI32 f0;
    int32_t f1;
} tuple_T2V44;
static tuple_T2V44 tuple_undefined_T2V44 = { { -1, NULL } , 0 };
#endif
#ifndef MYPYC_DECLARED_tuple_T2V22
#define MYPYC_DECLARED_tuple_T2V22
typedef struct tuple_T2V22 {
    VecI16 f0;
    int16_t f1;
} tuple_T2V22;
static tuple_T2V22 tuple_undefined_T2V22 = { { -1, NULL } , 0 };
#endif
#ifndef MYPYC_DECLARED_tuple_T2VU1U1
#define MYPYC_DECLARED_tuple_T2VU1U1
typedef struct tuple_T2VU1U1 {
    VecU8 f0;
    uint8_t f1;
} tuple_T2VU1U1;
static tuple_T2VU1U1 tuple_undefined_T2VU1U1 = { { -1, NULL } , 0 };
#endif
#ifndef MYPYC_DECLARED_tuple_T2VFF
#define MYPYC_DECLARED_tuple_T2VFF
typedef struct tuple_T2VFF {
    VecFloat f0;
    double f1;
} tuple_T2VFF;
static tuple_T2VFF tuple_undefined_T2VFF = { { -1, NULL } , 0.0 };
#endif
#ifndef MYPYC_DECLARED_tuple_T2VCC
#define MYPYC_DECLARED_tuple_T2VCC
typedef struct tuple_T2VCC {
    VecBool f0;
    char f1;
} tuple_T2VCC;
static tuple_T2VCC tuple_undefined_T2VCC = { { -1, NULL } , 0 };
#endif
// Readable aliases for the pop() return types above.
typedef tuple_T2V88 VecI64PopResult;
typedef tuple_T2V44 VecI32PopResult;
typedef tuple_T2V22 VecI16PopResult;
typedef tuple_T2VU1U1 VecU8PopResult;
typedef tuple_T2VFF VecFloatPopResult;
typedef tuple_T2VCC VecBoolPopResult;
// Capsule-exported operation tables: one table per specialized item type,
// giving compiled modules direct C-level access to the type objects and the
// core vec operations. Commented-out entries are planned-but-unimplemented
// slots kept here to document the intended final shape.
// vec[i64] operations + type objects (stored in a capsule)
typedef struct _VecI64API {
    PyTypeObject *boxed_type;
    PyTypeObject *buf_type;
    VecI64 (*alloc)(Py_ssize_t, Py_ssize_t);
    PyObject *(*box)(VecI64);
    VecI64 (*unbox)(PyObject *);
    VecI64 (*convert_from_nested)(VecNestedBufItem);
    VecI64 (*append)(VecI64, int64_t);
    VecI64PopResult (*pop)(VecI64, Py_ssize_t);
    VecI64 (*remove)(VecI64, int64_t);
    // TODO: Py_ssize_t
    VecI64 (*slice)(VecI64, int64_t, int64_t);
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, int64_t);
    // iter?
} VecI64API;
// vec[i32] operations + type objects (stored in a capsule)
typedef struct _VecI32API {
    PyTypeObject *boxed_type;
    PyTypeObject *buf_type;
    VecI32 (*alloc)(Py_ssize_t, Py_ssize_t);
    PyObject *(*box)(VecI32);
    VecI32 (*unbox)(PyObject *);
    VecI32 (*convert_from_nested)(VecNestedBufItem);
    VecI32 (*append)(VecI32, int32_t);
    VecI32PopResult (*pop)(VecI32, Py_ssize_t);
    VecI32 (*remove)(VecI32, int32_t);
    // TODO: Py_ssize_t
    VecI32 (*slice)(VecI32, int64_t, int64_t);
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, int32_t);
    // iter?
} VecI32API;
// vec[i16] operations + type objects (stored in a capsule)
typedef struct _VecI16API {
    PyTypeObject *boxed_type;
    PyTypeObject *buf_type;
    VecI16 (*alloc)(Py_ssize_t, Py_ssize_t);
    PyObject *(*box)(VecI16);
    VecI16 (*unbox)(PyObject *);
    VecI16 (*convert_from_nested)(VecNestedBufItem);
    VecI16 (*append)(VecI16, int16_t);
    VecI16PopResult (*pop)(VecI16, Py_ssize_t);
    VecI16 (*remove)(VecI16, int16_t);
    // TODO: Py_ssize_t
    VecI16 (*slice)(VecI16, int64_t, int64_t);
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, int16_t);
    // iter?
} VecI16API;
// vec[u8] operations + type objects (stored in a capsule)
typedef struct _VecU8API {
    PyTypeObject *boxed_type;
    PyTypeObject *buf_type;
    VecU8 (*alloc)(Py_ssize_t, Py_ssize_t);
    PyObject *(*box)(VecU8);
    VecU8 (*unbox)(PyObject *);
    VecU8 (*convert_from_nested)(VecNestedBufItem);
    VecU8 (*append)(VecU8, uint8_t);
    VecU8PopResult (*pop)(VecU8, Py_ssize_t);
    VecU8 (*remove)(VecU8, uint8_t);
    // TODO: Py_ssize_t
    VecU8 (*slice)(VecU8, int64_t, int64_t);
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, uint8_t);
    // iter?
} VecU8API;
// vec[float] operations + type objects (stored in a capsule)
typedef struct _VecFloatAPI {
    PyTypeObject *boxed_type;
    PyTypeObject *buf_type;
    VecFloat (*alloc)(Py_ssize_t, Py_ssize_t);
    PyObject *(*box)(VecFloat);
    VecFloat (*unbox)(PyObject *);
    VecFloat (*convert_from_nested)(VecNestedBufItem);
    VecFloat (*append)(VecFloat, double);
    VecFloatPopResult (*pop)(VecFloat, Py_ssize_t);
    VecFloat (*remove)(VecFloat, double);
    // TODO: Py_ssize_t
    VecFloat (*slice)(VecFloat, int64_t, int64_t);
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, double);
    // iter?
} VecFloatAPI;
// vec[bool] operations + type objects (stored in a capsule)
typedef struct _VecBoolAPI {
    PyTypeObject *boxed_type;
    PyTypeObject *buf_type;
    VecBool (*alloc)(Py_ssize_t, Py_ssize_t);
    PyObject *(*box)(VecBool);
    VecBool (*unbox)(PyObject *);
    VecBool (*convert_from_nested)(VecNestedBufItem);
    VecBool (*append)(VecBool, char);
    VecBoolPopResult (*pop)(VecBool, Py_ssize_t);
    VecBool (*remove)(VecBool, char);
    // TODO: Py_ssize_t
    VecBool (*slice)(VecBool, int64_t, int64_t);
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, char);
    // iter?
} VecBoolAPI;
#ifndef MYPYC_DECLARED_tuple_T2VOO
#define MYPYC_DECLARED_tuple_T2VOO
// Result pair for vec[T] pop: likely (resulting vec, popped item).
typedef struct tuple_T2VOO {
    VecT f0;       // vec value
    PyObject *f1;  // boxed item
} tuple_T2VOO;
// Sentinel for the error/undefined case; len == -1 marks an error vec
// (see VEC_IS_ERROR).
static tuple_T2VOO tuple_undefined_T2VOO = { { -1, NULL } , NULL };
#endif
typedef tuple_T2VOO VecTPopResult;
// vec[T] operations + type objects (stored in a capsule)
//
// T is a class type or class type | None
//
// The size_t item_type argument encodes the element's PyTypeObject pointer
// with bit 0 set when None is also allowed (see VEC_ITEM_TYPE and
// VecT_ItemCheck).
typedef struct _VecTAPI {
    PyTypeObject *boxed_type;   // wrapper type for boxed vec[T] values
    PyTypeObject *buf_type;     // buffer type that stores the items
    VecT (*alloc)(Py_ssize_t, Py_ssize_t, size_t);   // (size, cap, item_type); cf. VecT_New
    PyObject *(*box)(VecT, size_t);                  // (vec, item_type)
    VecT (*unbox)(PyObject *, size_t);               // (obj, item_type)
    VecT (*convert_from_nested)(VecNestedBufItem);   // build from a nested-buf item
    VecT (*append)(VecT, PyObject *, size_t);        // (vec, item, item_type)
    VecTPopResult (*pop)(VecT, Py_ssize_t);          // (vec, index)
    VecT (*remove)(VecT, PyObject *);
    // TODO: Py_ssize_t
    VecT (*slice)(VecT, int64_t, int64_t);           // presumably (start, end) -- TODO confirm
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, PyObject *);
    // iter?
} VecTAPI;
#ifndef MYPYC_DECLARED_tuple_T2VvVi
#define MYPYC_DECLARED_tuple_T2VvVi
// Result pair for nested vec pop: likely (resulting vec, popped item).
typedef struct tuple_T2VvVi {
    VecNested f0;        // vec value
    VecNestedBufItem f1; // popped (len, buf) item
} tuple_T2VvVi;
// Sentinel for the error/undefined case; len == -1 marks an error value
// (see VEC_IS_ERROR).
static tuple_T2VvVi tuple_undefined_T2VvVi = { { -1, NULL } , { -1, NULL } };
#endif
typedef tuple_T2VvVi VecNestedPopResult;
// Nested vec operations + type objects (stored in a capsule)
//
// A nested vec holds other vecs; `depth` is the nesting level and
// `item_type` is the element type of the innermost vecs.
typedef struct _VecNestedAPI {
    PyTypeObject *boxed_type;   // wrapper type for boxed nested vec values
    PyTypeObject *buf_type;     // buffer type that stores the items
    VecNested (*alloc)(Py_ssize_t, Py_ssize_t, size_t, size_t depth); // (size, cap, item_type, depth); cf. VecNested_New
    PyObject *(*box)(VecNested);
    VecNested (*unbox)(PyObject *, size_t, size_t depth);             // (obj, item_type, depth)
    VecNested (*convert_from_nested)(VecNestedBufItem);
    VecNested (*append)(VecNested, VecNestedBufItem);
    VecNestedPopResult (*pop)(VecNested, Py_ssize_t);                 // (vec, index)
    VecNested (*remove)(VecNested, VecNestedBufItem);
    // TODO: Py_ssize_t
    VecNested (*slice)(VecNested, int64_t, int64_t);                  // presumably (start, end) -- TODO confirm
    // PyObject *(*extend)(PyObject *, PyObject *);
    // PyObject *(*concat)(PyObject *, PyObject *);
    // bool (*contains)(PyObject *, PyObject *);
    // iter?
} VecNestedAPI;
// Top-level table bundling all vec API tables; this is the object published
// through the "librt.vecs._C_API" capsule (see import_librt_vecs).
typedef struct {
    VecTAPI *t;             // generic vec[T]
    VecNestedAPI *nested;   // nested vecs
    VecI64API *i64;
    VecI32API *i32;
    VecI16API *i16;
    VecU8API *u8;
    VecFloatAPI *float_;    // trailing underscore avoids the C keyword
    VecBoolAPI *bool_;      // trailing underscore avoids the C99/C23 keyword
    PyTypeObject *(*get_vec_type)(void); // Function to get base VecType for isinstance checks
} VecCapsule;
// Size (ob_size) of a buffer object; also used as the vec capacity (VEC_CAP).
#define VEC_BUF_SIZE(b) ((b)->ob_base.ob_size)
// item_type packs a PyTypeObject pointer with bit 0 as the "or None" flag
// (see VecT_ItemCheck); mask the flag off to recover the type object.
#define VEC_ITEM_TYPE(t) ((PyTypeObject *)((t) & ~1))
#define VEC_BUF_ITEM_TYPE(b) VEC_ITEM_TYPE((b)->item_type)
// Capacity of a vec value is the ob_size of its underlying buffer.
#define VEC_CAP(v) ((v).buf->ob_base.ob_size)
// A negative length marks a vec value in the error state.
#define VEC_IS_ERROR(v) ((v).len < 0)
// Reference counting operates on the underlying buffer object (may be NULL).
#define VEC_DECREF(v) Py_XDECREF((v).buf)
#define VEC_INCREF(v) Py_XINCREF((v).buf)
// Type objects
// Buffer type objects that store vec items
extern PyTypeObject VecI64BufType;
extern PyTypeObject VecI32BufType;
extern PyTypeObject VecI16BufType;
extern PyTypeObject VecU8BufType;
extern PyTypeObject VecFloatBufType;
extern PyTypeObject VecBoolBufType;
extern PyTypeObject VecTBufType;
extern PyTypeObject VecNestedBufType;
// Wrapper type objects for boxed vec values
extern PyTypeObject VecI64Type;
extern PyTypeObject VecI32Type;
extern PyTypeObject VecI16Type;
extern PyTypeObject VecU8Type;
extern PyTypeObject VecFloatType;
extern PyTypeObject VecBoolType;
extern PyTypeObject VecTType;
extern PyTypeObject VecNestedType;
// Type objects corresponding to the 'i64', 'i32', 'i16', and 'u8' types
extern PyTypeObject *LibRTVecs_I64TypeObj;
extern PyTypeObject *LibRTVecs_I32TypeObj;
extern PyTypeObject *LibRTVecs_I16TypeObj;
extern PyTypeObject *LibRTVecs_U8TypeObj;
// API tables exported to other modules through the VecCapsule
extern VecI64API Vec_I64API;
extern VecI32API Vec_I32API;
extern VecI16API Vec_I16API;
extern VecU8API Vec_U8API;
extern VecFloatAPI Vec_FloatAPI;
extern VecBoolAPI Vec_BoolAPI;
extern VecTAPI Vec_TAPI;
extern VecNestedAPI Vec_NestedAPI;
// Reject a Python float where an integer item is expected.
// Returns 1 with a TypeError set if `o` is a float, 0 otherwise.
static inline int Vec_CheckFloatError(PyObject *o) {
    if (!PyFloat_Check(o)) {
        return 0;
    }
    PyErr_SetString(PyExc_TypeError, "integer argument expected, got float");
    return 1;
}
// vec[i64] operations
static inline int VecI64_Check(PyObject *o) {
    // Exact type match only (no subtype check).
    return Py_TYPE(o) == &VecI64Type;
}
static inline PyObject *VecI64_BoxItem(int64_t x) {
    // Box a single i64 item as a Python int.
    return PyLong_FromLongLong(x);
}
static inline int64_t VecI64_UnboxItem(PyObject *o) {
    // Floats are rejected explicitly; -1 with a pending exception signals failure.
    return Vec_CheckFloatError(o) ? -1 : PyLong_AsLongLong(o);
}
static inline int VecI64_IsUnboxError(int64_t x) {
    // -1 is an error only when an exception is actually pending.
    return x == -1 && PyErr_Occurred() != NULL;
}
PyObject *VecI64_Box(VecI64);
VecI64 VecI64_Append(VecI64, int64_t x);
VecI64 VecI64_Remove(VecI64, int64_t x);
VecI64PopResult VecI64_Pop(VecI64 v, Py_ssize_t index);
// vec[i32] operations
static inline int VecI32_Check(PyObject *o) {
return o->ob_type == &VecI32Type;
}
static inline PyObject *VecI32_BoxItem(int32_t x) {
return PyLong_FromLongLong(x);
}
static inline int32_t VecI32_UnboxItem(PyObject *o) {
if (Vec_CheckFloatError(o))
return -1;
long x = PyLong_AsLong(o);
if (x > INT32_MAX || x < INT32_MIN) {
PyErr_SetString(PyExc_OverflowError, "Python int too large to convert to i32");
return -1;
}
return x;
}
static inline int VecI32_IsUnboxError(int32_t x) {
return x == -1 && PyErr_Occurred();
}
PyObject *VecI32_Box(VecI32);
VecI32 VecI32_Append(VecI32, int32_t x);
VecI32 VecI32_Remove(VecI32, int32_t x);
VecI32PopResult VecI32_Pop(VecI32 v, Py_ssize_t index);
// vec[i16] operations
static inline int VecI16_Check(PyObject *o) {
    // Exact type match only (no subtype check).
    return o->ob_type == &VecI16Type;
}
static inline PyObject *VecI16_BoxItem(int16_t x) {
    // Box a single i16 item as a Python int.
    return PyLong_FromLongLong(x);
}
static inline int16_t VecI16_UnboxItem(PyObject *o) {
    if (Vec_CheckFloatError(o))
        return -1;
    long x = PyLong_AsLong(o);
    // Use the named <stdint.h> limits instead of magic 32768 constants,
    // matching the style of the i32 range check above. (x > INT16_MAX is
    // exactly x >= 32768; x < INT16_MIN is exactly x < -32768.)
    if (x > INT16_MAX || x < INT16_MIN) {
        PyErr_SetString(PyExc_OverflowError, "Python int too large to convert to i16");
        return -1;
    }
    return (int16_t)x;
}
static inline int VecI16_IsUnboxError(int16_t x) {
    // -1 is an error only when an exception is actually pending.
    return x == -1 && PyErr_Occurred();
}
PyObject *VecI16_Box(VecI16);
VecI16 VecI16_Append(VecI16, int16_t x);
VecI16 VecI16_Remove(VecI16, int16_t x);
VecI16PopResult VecI16_Pop(VecI16 v, Py_ssize_t index);
// vec[u8] operations
static inline int VecU8_Check(PyObject *o) {
    // Exact type match only (no subtype check).
    return o->ob_type == &VecU8Type;
}
static inline PyObject *VecU8_BoxItem(uint8_t x) {
    // Box a single u8 item as a Python int.
    return PyLong_FromUnsignedLong(x);
}
// Unbox a u8 item. On failure, an exception is set and the in-band sentinel
// 239 is returned; VecU8_IsUnboxError only treats 239 as an error when an
// exception is pending, so a legitimate 239 value still works.
static inline uint8_t VecU8_UnboxItem(PyObject *o) {
    if (Vec_CheckFloatError(o))
        // BUG FIX: previously returned -1, which truncates to 255 in a
        // uint8_t, so VecU8_IsUnboxError (which checks for 239) silently
        // missed the TypeError. Return the 239 sentinel like every other
        // error path in this function.
        return 239;
    unsigned long x = PyLong_AsUnsignedLong(o);
    if (x <= 255)
        return (uint8_t)x;
    else if (x == (unsigned long)-1)
        // PyLong_AsUnsignedLong failed and already set an exception.
        return 239;
    else {
        PyErr_SetString(PyExc_OverflowError, "Python int too large to convert to u8");
        return 239;
    }
}
static inline int VecU8_IsUnboxError(uint8_t x) {
    return x == 239 && PyErr_Occurred();
}
PyObject *VecU8_Box(VecU8);
VecU8 VecU8_Append(VecU8, uint8_t x);
VecU8 VecU8_Remove(VecU8, uint8_t x);
VecU8PopResult VecU8_Pop(VecU8 v, Py_ssize_t index);
// vec[float] operations
static inline int VecFloat_Check(PyObject *o) {
return o->ob_type == &VecFloatType;
}
static inline PyObject *VecFloat_BoxItem(double x) {
return PyFloat_FromDouble(x);
}
static inline double VecFloat_UnboxItem(PyObject *o) {
return PyFloat_AsDouble(o);
}
static inline int VecFloat_IsUnboxError(double x) {
return x == -1.0 && PyErr_Occurred();
}
PyObject *VecFloat_Box(VecFloat);
VecFloat VecFloat_Append(VecFloat, double x);
VecFloat VecFloat_Remove(VecFloat, double x);
VecFloatPopResult VecFloat_Pop(VecFloat v, Py_ssize_t index);
// vec[bool] operations
static inline int VecBool_Check(PyObject *o) {
    // Exact type match only (no subtype check).
    return Py_TYPE(o) == &VecBoolType;
}
static inline PyObject *VecBool_BoxItem(char x) {
    // Only the exact value 1 boxes to True (matching VecBool_UnboxItem);
    // PyBool_FromLong returns a new reference to Py_True/Py_False.
    return PyBool_FromLong(x == 1);
}
static inline char VecBool_UnboxItem(PyObject *o) {
    if (o == Py_True) {
        return 1;
    }
    if (o == Py_False) {
        return 0;
    }
    PyErr_SetString(PyExc_TypeError, "bool value expected");
    return 2;  // out-of-band sentinel checked by VecBool_IsUnboxError
}
static inline int VecBool_IsUnboxError(char x) {
    return x == 2;
}
PyObject *VecBool_Box(VecBool);
VecBool VecBool_Append(VecBool, char x);
VecBool VecBool_Remove(VecBool, char x);
VecBoolPopResult VecBool_Pop(VecBool v, Py_ssize_t index);
// vec[t] operations
static inline int VecT_Check(PyObject *o) {
    // Exact type match only (no subtype check).
    return o->ob_type == &VecTType;
}
// Check that `item` is an acceptable element for the given item_type.
// item_type encodes a PyTypeObject pointer with bit 0 set when None is
// also permitted (the "class type | None" case); see VEC_ITEM_TYPE.
// Returns 1 if valid, 0 with a TypeError set otherwise.
// NOTE(review): the `v` parameter is unused here.
static inline int VecT_ItemCheck(VecT v, PyObject *item, size_t item_type) {
    if (PyObject_TypeCheck(item, VEC_ITEM_TYPE(item_type))) {
        return 1;
    } else if ((item_type & 1) && item == Py_None) {
        // Optional element type: None is allowed.
        return 1;
    } else {
        // TODO: better error message
        PyErr_SetString(PyExc_TypeError, "invalid item type");
        return 0;
    }
}
VecT VecT_New(Py_ssize_t size, Py_ssize_t cap, size_t item_type);
PyObject *VecT_FromIterable(size_t item_type, PyObject *iterable);
PyObject *VecT_Box(VecT vec, size_t item_type);
VecT VecT_Append(VecT vec, PyObject *x, size_t item_type);
VecT VecT_Remove(VecT vec, PyObject *x);
VecTPopResult VecT_Pop(VecT v, Py_ssize_t index);
// Nested vec operations
static inline int VecNested_Check(PyObject *o) {
    // Exact type match only (no subtype check).
    return o->ob_type == &VecNestedType;
}
VecNested VecNested_New(Py_ssize_t size, Py_ssize_t cap, size_t item_type, size_t depth);
PyObject *VecNested_FromIterable(size_t item_type, size_t depth, PyObject *iterable);
PyObject *VecNested_Box(VecNested);
VecNested VecNested_Append(VecNested vec, VecNestedBufItem x);
VecNested VecNested_Remove(VecNested vec, VecNestedBufItem x);
VecNestedPopResult VecNested_Pop(VecNested v, Py_ssize_t index);
// Return 0 on success, -1 on error. Store unboxed item in *unboxed if successful.
// Return a *borrowed* reference.
//
// An item of a nested vec `v` is itself a vec: at depth 1 it must be one of
// the concrete wrapper objects (VecI64/.../VecT) whose element type matches
// v's item_type; at depth > 1 it must be a nested vec exactly one level
// shallower with the same element type.
static inline int VecNested_UnboxItem(VecNested v, PyObject *item, VecNestedBufItem *unboxed) {
    size_t depth = v.buf->depth;
    if (depth == 1) {
        // Depth-1 items are plain (non-nested) vecs; dispatch on wrapper type.
        if (item->ob_type == &VecTType) {
            // NOTE(review): a VecTType item is cast to VecNestedObject here;
            // presumably the wrapper structs share the same (len, buf)
            // layout -- TODO confirm against the struct definitions.
            VecNestedObject *o = (VecNestedObject *)item;
            if (o->vec.buf->item_type == v.buf->item_type) {
                unboxed->len = o->vec.len;
                unboxed->buf = (PyObject *)o->vec.buf;
                return 0;
            }
        } else if (item->ob_type == &VecI64Type && v.buf->item_type == VEC_ITEM_TYPE_I64) {
            VecI64Object *o = (VecI64Object *)item;
            unboxed->len = o->vec.len;
            unboxed->buf = (PyObject *)o->vec.buf;
            return 0;
        } else if (item->ob_type == &VecU8Type && v.buf->item_type == VEC_ITEM_TYPE_U8) {
            VecU8Object *o = (VecU8Object *)item;
            unboxed->len = o->vec.len;
            unboxed->buf = (PyObject *)o->vec.buf;
            return 0;
        } else if (item->ob_type == &VecFloatType && v.buf->item_type == VEC_ITEM_TYPE_FLOAT) {
            VecFloatObject *o = (VecFloatObject *)item;
            unboxed->len = o->vec.len;
            unboxed->buf = (PyObject *)o->vec.buf;
            return 0;
        } else if (item->ob_type == &VecI32Type && v.buf->item_type == VEC_ITEM_TYPE_I32) {
            VecI32Object *o = (VecI32Object *)item;
            unboxed->len = o->vec.len;
            unboxed->buf = (PyObject *)o->vec.buf;
            return 0;
        } else if (item->ob_type == &VecI16Type && v.buf->item_type == VEC_ITEM_TYPE_I16) {
            VecI16Object *o = (VecI16Object *)item;
            unboxed->len = o->vec.len;
            unboxed->buf = (PyObject *)o->vec.buf;
            return 0;
        } else if (item->ob_type == &VecBoolType && v.buf->item_type == VEC_ITEM_TYPE_BOOL) {
            VecBoolObject *o = (VecBoolObject *)item;
            unboxed->len = o->vec.len;
            unboxed->buf = (PyObject *)o->vec.buf;
            return 0;
        }
    } else if (item->ob_type == &VecNestedType) {
        // Deeper vecs: the item must be exactly one level shallower and
        // share the same innermost element type.
        VecNestedObject *o = (VecNestedObject *)item;
        if (o->vec.buf->depth == v.buf->depth - 1
                && o->vec.buf->item_type == v.buf->item_type) {
            unboxed->len = o->vec.len;
            unboxed->buf = (PyObject *)o->vec.buf;
            return 0;
        }
    }
    // TODO: better error message
    PyErr_SetString(PyExc_TypeError, "invalid item type");
    return -1;
}
// Box one item of a nested vec `v` into the appropriate Python wrapper
// object. Items with negative length (the error/undefined sentinel) box to
// None. Takes a new reference on the item's buffer; returns a new reference.
//
// Fix: the inner vec locals previously all shadowed the parameter `v`
// (one `VecNested v` and one per element type), which trips -Wshadow and
// makes the dispatch hard to read; they are renamed. Behavior is unchanged.
static inline PyObject *VecNested_BoxItem(VecNested v, VecNestedBufItem item) {
    if (item.len < 0)
        Py_RETURN_NONE;
    Py_XINCREF(item.buf);
    if (v.buf->depth > 1) {
        // Item is a nested vec
        VecNested nested = { .len = item.len, .buf = (VecNestedBufObject *)item.buf };
        return VecNested_Box(nested);
    } else {
        // Item is a non-nested vec; dispatch on the element type.
        size_t item_type = v.buf->item_type;
        if (item_type == VEC_ITEM_TYPE_I64) {
            VecI64 iv = { .len = item.len, .buf = (VecI64BufObject *)item.buf };
            return VecI64_Box(iv);
        } else if (item_type == VEC_ITEM_TYPE_U8) {
            VecU8 uv = { .len = item.len, .buf = (VecU8BufObject *)item.buf };
            return VecU8_Box(uv);
        } else if (item_type == VEC_ITEM_TYPE_FLOAT) {
            VecFloat fv = { .len = item.len, .buf = (VecFloatBufObject *)item.buf };
            return VecFloat_Box(fv);
        } else if (item_type == VEC_ITEM_TYPE_I32) {
            VecI32 iv32 = { .len = item.len, .buf = (VecI32BufObject *)item.buf };
            return VecI32_Box(iv32);
        } else if (item_type == VEC_ITEM_TYPE_I16) {
            VecI16 iv16 = { .len = item.len, .buf = (VecI16BufObject *)item.buf };
            return VecI16_Box(iv16);
        } else if (item_type == VEC_ITEM_TYPE_BOOL) {
            VecBool bv = { .len = item.len, .buf = (VecBoolBufObject *)item.buf };
            return VecBool_Box(bv);
        } else {
            // Generic vec[t]
            VecT tv = { .len = item.len, .buf = (VecTBufObject *)item.buf };
            return VecT_Box(tv, item_type);
        }
    }
}
// Misc helpers
// Presumably renders the vec type (item_type + depth) as a Python str.
PyObject *Vec_TypeToStr(size_t item_type, size_t depth);
PyObject *Vec_GenericRepr(PyObject *vec, size_t item_type, size_t depth, int verbose);
// Shared implementations operating on a raw (length, items) view of a
// boxed-object vec.
PyObject *Vec_GenericRichcompare(Py_ssize_t *len, PyObject **items,
                                 Py_ssize_t *other_len, PyObject **other_items,
                                 int op);
int Vec_GenericRemove(Py_ssize_t *len, PyObject **items, PyObject *item);
PyObject *Vec_GenericPopWrapper(Py_ssize_t *len, PyObject **items, PyObject *args);
PyObject *Vec_GenericPop(Py_ssize_t *len, PyObject **items, Py_ssize_t index);
// Global API pointers initialized by import_librt_vecs()
// NOTE(review): being `static` in a header, each including translation unit
// gets its own copy and must call import_librt_vecs() itself.
static VecCapsule *VecApi;
static VecI64API VecI64Api;
static VecI32API VecI32Api;
static VecI16API VecI16Api;
static VecU8API VecU8Api;
static VecFloatAPI VecFloatApi;
static VecBoolAPI VecBoolApi;
static VecTAPI VecTApi;
static VecNestedAPI VecNestedApi;
static int
import_librt_vecs(void)
{
PyObject *mod = PyImport_ImportModule("librt.vecs");
if (mod == NULL)
return -1;
Py_DECREF(mod); // we import just for the side effect of making the below work.
VecApi = PyCapsule_Import("librt.vecs._C_API", 0);
if (!VecApi)
return -1;
VecI64Api = *VecApi->i64;
VecI32Api = *VecApi->i32;
VecI16Api = *VecApi->i16;
VecU8Api = *VecApi->u8;
VecFloatApi = *VecApi->float_;
VecBoolApi = *VecApi->bool_;
VecTApi = *VecApi->t;
VecNestedApi = *VecApi->nested;
return 0;
}
#endif // MYPYC_EXPERIMENTAL
#endif // VEC_H_INCL

View file

@ -0,0 +1,20 @@
#ifdef MYPYC_EXPERIMENTAL
// Instantiate the shared vec implementation for vec[bool]: the macros below
// parameterize vec_template.c, which is presumably responsible for
// #undef-ing them after expansion -- TODO confirm.
#define VEC VecBool
#define VEC_TYPE VecBoolType
#define VEC_OBJECT VecBoolObject
#define BUF_OBJECT VecBoolBufObject
#define BUF_TYPE VecBoolBufType
#define NAME(suffix) VecBool##suffix
#define FUNC(suffix) VecBool_##suffix
#define ITEM_TYPE_STR "bool"
#define ITEM_TYPE_MAGIC VEC_ITEM_TYPE_BOOL
#define ITEM_C_TYPE char
#define FEATURES Vec_BoolAPI
#define BOX_ITEM VecBool_BoxItem
#define UNBOX_ITEM VecBool_UnboxItem
#define IS_UNBOX_ERROR VecBool_IsUnboxError
#include "vec_template.c"
#endif // MYPYC_EXPERIMENTAL

View file

@ -0,0 +1,20 @@
#ifdef MYPYC_EXPERIMENTAL
// Instantiate the shared vec implementation for vec[float]: the macros below
// parameterize vec_template.c, which is presumably responsible for
// #undef-ing them after expansion -- TODO confirm.
#define VEC VecFloat
#define VEC_TYPE VecFloatType
#define VEC_OBJECT VecFloatObject
#define BUF_OBJECT VecFloatBufObject
#define BUF_TYPE VecFloatBufType
#define NAME(suffix) VecFloat##suffix
#define FUNC(suffix) VecFloat_##suffix
#define ITEM_TYPE_STR "float"
#define ITEM_TYPE_MAGIC VEC_ITEM_TYPE_FLOAT
#define ITEM_C_TYPE double
#define FEATURES Vec_FloatAPI
#define BOX_ITEM VecFloat_BoxItem
#define UNBOX_ITEM VecFloat_UnboxItem
#define IS_UNBOX_ERROR VecFloat_IsUnboxError
#include "vec_template.c"
#endif // MYPYC_EXPERIMENTAL

View file

@ -0,0 +1,20 @@
#ifdef MYPYC_EXPERIMENTAL
// Instantiate the shared vec implementation for vec[i16]: the macros below
// parameterize vec_template.c, which is presumably responsible for
// #undef-ing them after expansion -- TODO confirm.
#define VEC VecI16
#define VEC_TYPE VecI16Type
#define VEC_OBJECT VecI16Object
#define BUF_OBJECT VecI16BufObject
#define BUF_TYPE VecI16BufType
#define NAME(suffix) VecI16##suffix
#define FUNC(suffix) VecI16_##suffix
#define ITEM_TYPE_STR "i16"
#define ITEM_TYPE_MAGIC VEC_ITEM_TYPE_I16
#define ITEM_C_TYPE int16_t
#define FEATURES Vec_I16API
#define BOX_ITEM VecI16_BoxItem
#define UNBOX_ITEM VecI16_UnboxItem
#define IS_UNBOX_ERROR VecI16_IsUnboxError
#include "vec_template.c"
#endif // MYPYC_EXPERIMENTAL

View file

@ -0,0 +1,20 @@
#ifdef MYPYC_EXPERIMENTAL
// Instantiate the shared vec implementation for vec[i32]: the macros below
// parameterize vec_template.c, which is presumably responsible for
// #undef-ing them after expansion -- TODO confirm.
#define VEC VecI32
#define VEC_TYPE VecI32Type
#define VEC_OBJECT VecI32Object
#define BUF_OBJECT VecI32BufObject
#define BUF_TYPE VecI32BufType
#define NAME(suffix) VecI32##suffix
#define FUNC(suffix) VecI32_##suffix
#define ITEM_TYPE_STR "i32"
#define ITEM_TYPE_MAGIC VEC_ITEM_TYPE_I32
#define ITEM_C_TYPE int32_t
#define FEATURES Vec_I32API
#define BOX_ITEM VecI32_BoxItem
#define UNBOX_ITEM VecI32_UnboxItem
#define IS_UNBOX_ERROR VecI32_IsUnboxError
#include "vec_template.c"
#endif // MYPYC_EXPERIMENTAL

Some files were not shown because too many files have changed in this diff Show more