diff --git a/Doc/library/base64.rst b/Doc/library/base64.rst index 4876117f6403b2..55f20e1e8b4628 100644 --- a/Doc/library/base64.rst +++ b/Doc/library/base64.rst @@ -72,7 +72,7 @@ POST request. Added the *wrapcol* parameter. -.. function:: b64decode(s, altchars=None, validate=False) +.. function:: b64decode(s, altchars=None, validate=False, *, ignorechars=None) Decode the Base64 encoded :term:`bytes-like object` or ASCII string *s* and return the decoded :class:`bytes`. @@ -90,10 +90,19 @@ POST request. these non-alphabet characters in the input result in a :exc:`binascii.Error`. + Optional *ignorechars* must be a :term:`bytes-like object` specifying + characters to ignore during decoding. When provided, only characters in + this set will be silently ignored; other non-base64 characters will cause + a :exc:`binascii.Error`. When ``None`` (the default), the behavior is + controlled by the *validate* parameter. + For more information about the strict base64 check, see :func:`binascii.a2b_base64` May assert or raise a :exc:`ValueError` if the length of *altchars* is not 2. + .. versionchanged:: next + Added the *ignorechars* parameter. + .. function:: standard_b64encode(s) Encode :term:`bytes-like object` *s* using the standard Base64 alphabet diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index eaf755711bc292..f95e0735683954 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -48,7 +48,7 @@ The :mod:`binascii` module defines the following functions: Added the *backtick* parameter. -.. function:: a2b_base64(string, /, *, strict_mode=False) +.. function:: a2b_base64(string, /, *, strict_mode=False, ignorechars=None) Convert a block of base64 data back to binary and return the binary data. More than one line may be passed at a time. @@ -63,9 +63,18 @@ The :mod:`binascii` module defines the following functions: * Contains no excess data after padding (including excess padding, newlines, etc.). * Does not start with a padding. 
+ Optional *ignorechars* must be a :term:`bytes-like object` specifying + characters to ignore during decoding. When provided, only characters in + this set are silently ignored; any other non-base64 character raises + :exc:`binascii.Error`. When ``None`` (the default), all non-base64 + characters are silently ignored (unless *strict_mode* is true). + .. versionchanged:: 3.11 Added the *strict_mode* parameter. + .. versionchanged:: next + Added the *ignorechars* parameter. + .. function:: b2a_base64(data, *, wrapcol=0, newline=True) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index b7a27d5db63875..9f4e092fa0f58c 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -444,6 +444,9 @@ base64 * Added the *wrapcol* parameter in :func:`~base64.b64encode`. (Contributed by Serhiy Storchaka in :gh:`143214`.) +* Added the *ignorechars* parameter in :func:`~base64.b64decode`. + (Contributed by Muneeb Ullah in :gh:`144001`.) + binascii -------- @@ -451,6 +454,9 @@ binascii * Added the *wrapcol* parameter in :func:`~binascii.b2a_base64`. (Contributed by Serhiy Storchaka in :gh:`143214`.) +* Added the *ignorechars* parameter in :func:`~binascii.a2b_base64`. + (Contributed by Muneeb Ullah in :gh:`144001`.) 
+ calendar -------- diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 4a5b2a925413bf..fc297a2933a786 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1797,6 +1797,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ident)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(identity_hint)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ignore)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ignorechars)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(imag)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(implieslink)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(importlib)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 7c2f44ef6dbe7a..563ccd7cf6d3f4 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -520,6 +520,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(ident) STRUCT_FOR_ID(identity_hint) STRUCT_FOR_ID(ignore) + STRUCT_FOR_ID(ignorechars) STRUCT_FOR_ID(imag) STRUCT_FOR_ID(implieslink) STRUCT_FOR_ID(importlib) diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 6e7bad986dbeda..ba7c0e68434517 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1795,6 +1795,7 @@ extern "C" { INIT_ID(ident), \ INIT_ID(identity_hint), \ INIT_ID(ignore), \ + INIT_ID(ignorechars), \ INIT_ID(imag), \ INIT_ID(implieslink), \ INIT_ID(importlib), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index 660115931da0a0..44063794293990 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ 
b/Include/internal/pycore_unicodeobject_generated.h @@ -1860,6 +1860,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(ignorechars); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(imag); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/Lib/base64.py b/Lib/base64.py index e62ae6aff580fa..91d6806db3bbf1 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -62,7 +62,7 @@ def b64encode(s, altchars=None, *, wrapcol=0): return encoded -def b64decode(s, altchars=None, validate=False): +def b64decode(s, altchars=None, validate=False, *, ignorechars=None): """Decode the Base64 encoded bytes-like object or ASCII string s. Optional altchars must be a bytes-like object or ASCII string of length 2 @@ -79,13 +79,18 @@ def b64decode(s, altchars=None, validate=False): For more information about the strict base64 check, see: https://docs.python.org/3.11/library/binascii.html#binascii.a2b_base64 + + Optional ignorechars must be a bytes-like object specifying characters to + ignore during decoding. When provided, only characters in this set will be + silently ignored; other non-base64 characters will cause a binascii.Error. + When None (the default), the behavior is controlled by the validate parameter. 
""" s = _bytes_from_decode_data(s) if altchars is not None: altchars = _bytes_from_decode_data(altchars) assert len(altchars) == 2, repr(altchars) s = s.translate(bytes.maketrans(altchars, b'+/')) - return binascii.a2b_base64(s, strict_mode=validate) + return binascii.a2b_base64(s, strict_mode=validate, ignorechars=ignorechars) def standard_b64encode(s): diff --git a/Lib/test/test_base64.py b/Lib/test/test_base64.py index 120c5824a42a40..d1f5d953c5bac3 100644 --- a/Lib/test/test_base64.py +++ b/Lib/test/test_base64.py @@ -331,6 +331,47 @@ def test_b64decode_invalid_chars(self): self.assertEqual(base64.urlsafe_b64decode(b'++//'), res) self.assertEqual(base64.urlsafe_b64decode(b'--__'), res) + def test_b64decode_ignorechars(self): + # gh-144001: Test ignorechars parameter + eq = self.assertEqual + + # Basic functionality: ignore whitespace characters + eq(base64.b64decode(b'YWJj\n', ignorechars=b'\n'), b'abc') + eq(base64.b64decode(b'YWJj\r\n', ignorechars=b'\r\n'), b'abc') + eq(base64.b64decode(b'YWJj \t\n', ignorechars=b' \t\n'), b'abc') + + # Multiple whitespace characters in data + eq(base64.b64decode(b'YW Jj\nYW I=', ignorechars=b' \n'), b'abcab') + + # ignorechars=b'' should reject all non-base64 characters + with self.assertRaises(binascii.Error): + base64.b64decode(b'YWJj\n', ignorechars=b'') + with self.assertRaises(binascii.Error): + base64.b64decode(b'YWJj ', ignorechars=b'') + + # Characters not in ignorechars should raise error + with self.assertRaises(binascii.Error): + base64.b64decode(b'YWJj!', ignorechars=b'\n') + with self.assertRaises(binascii.Error): + base64.b64decode(b'YWJj@', ignorechars=b' \t\n') + + # ignorechars with custom characters + eq(base64.b64decode(b'YW|Jj', ignorechars=b'|'), b'abc') + eq(base64.b64decode(b'YW#Jj', ignorechars=b'#'), b'abc') + + # Valid base64 with ignorechars=None (default) should work + eq(base64.b64decode(b'YWJj\n', ignorechars=None), b'abc') + eq(base64.b64decode(b'YWJj!', ignorechars=None), b'abc') + + # Test 
with altchars and ignorechars together + eq(base64.b64decode(b'YW-j\n', altchars=b'-_', ignorechars=b'\n'), b'ao\xa3') + + # Test string input + eq(base64.b64decode('YWJj\n', ignorechars=b'\n'), b'abc') + + # Test that ignorechars accepts various bytes-like objects + eq(base64.b64decode(b'YWJj\n', ignorechars=bytearray(b'\n')), b'abc') + def _altchars_strategy(): """Generate 'altchars' for base64 encoding.""" reserved_chars = (string.digits + string.ascii_letters + "=").encode() diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 47e1e6ab035a17..8f1a9d034b9327 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -176,6 +176,45 @@ def assertExcessPadding(data, non_strict_mode_expected_result: bytes): assertExcessPadding(b'abcd====', b'i\xb7\x1d') assertExcessPadding(b'abcd=====', b'i\xb7\x1d') + def test_base64_ignorechars(self): + # gh-144001: Test ignorechars parameter for a2b_base64 + a2b = binascii.a2b_base64 + type2test = self.type2test + + # Basic functionality: ignore specified characters + self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=b'\n'), b'abc') + self.assertEqual(a2b(type2test(b'YWJj\r\n'), ignorechars=b'\r\n'), b'abc') + self.assertEqual(a2b(type2test(b'YWJj \t\n'), ignorechars=b' \t\n'), b'abc') + + # Multiple ignored characters in data + self.assertEqual(a2b(type2test(b'YW Jj\nYW I='), ignorechars=b' \n'), b'abcab') + + # ignorechars=b'' should reject all non-base64 characters + with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'): + a2b(type2test(b'YWJj\n'), ignorechars=b'') + with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'): + a2b(type2test(b'YWJj '), ignorechars=b'') + + # Characters not in ignorechars should raise error + with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'): + a2b(type2test(b'YWJj!'), ignorechars=b'\n') + with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'): + a2b(type2test(b'YWJj@'), ignorechars=b' \t\n') + + # 
ignorechars with custom characters + self.assertEqual(a2b(type2test(b'YW|Jj'), ignorechars=b'|'), b'abc') + self.assertEqual(a2b(type2test(b'YW#Jj'), ignorechars=b'#'), b'abc') + + # ignorechars=None should use default behavior (ignore all non-base64) + self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=None), b'abc') + self.assertEqual(a2b(type2test(b'YWJj!'), ignorechars=None), b'abc') + + # Test interaction with strict_mode + # When both are used, ignorechars takes precedence for character filtering + self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=b'\n', strict_mode=False), b'abc') + + # Test that ignorechars accepts various bytes-like objects + self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=bytearray(b'\n')), b'abc') def test_base64errors(self): # Test base64 with invalid padding diff --git a/Misc/NEWS.d/next/Library/2026-01-18-17-29-29.gh-issue-144001.uWGYjV.rst b/Misc/NEWS.d/next/Library/2026-01-18-17-29-29.gh-issue-144001.uWGYjV.rst new file mode 100644 index 00000000000000..394314635da5c9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-01-18-17-29-29.gh-issue-144001.uWGYjV.rst @@ -0,0 +1,6 @@ +Add the ``ignorechars`` parameter to :func:`binascii.a2b_base64` and +:func:`base64.b64decode`. When provided, only characters in this set will be +silently ignored during decoding; other non-base64 characters will cause an +error. This allows selective filtering of characters (e.g., ignoring +whitespace while rejecting other invalid characters), similar to the existing +``ignorechars`` parameter in :func:`base64.a85decode`. diff --git a/Modules/binascii.c b/Modules/binascii.c index c569d3187f2e67..8df7bced6ce669 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -477,17 +477,24 @@ binascii.a2b_base64 / * strict_mode: bool = False + ignorechars: object = None Decode a line of base64 data. strict_mode When set to True, bytes that are not part of the base64 standard are not allowed. The same applies to excess data after padding (= / ==). 
+ ignorechars + A bytes-like object specifying characters to ignore during decoding. + When provided, only characters in this set will be silently ignored; + other non-base64 characters will cause an error. When None (the default), + all non-base64 characters are silently ignored (unless strict_mode is True). [clinic start generated code]*/ static PyObject * -binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) -/*[clinic end generated code: output=5409557788d4f975 input=13c797187acc9c40]*/ +binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode, + PyObject *ignorechars) +/*[clinic end generated code: output=7d2b92b6f1de3ccc input=485946ff2e8960c6]*/ { assert(data->len >= 0); @@ -496,10 +503,30 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) binascii_state *state = NULL; char padding_started = 0; + /* Handle ignorechars parameter */ + Py_buffer ignorechars_buf = {0}; + int has_ignorechars = 0; + unsigned char ignorechars_table[256] = {0}; /* Lookup table for ignored chars */ + + if (ignorechars != Py_None) { + if (PyObject_GetBuffer(ignorechars, &ignorechars_buf, PyBUF_SIMPLE) < 0) { + return NULL; + } + has_ignorechars = 1; + /* Build lookup table for O(1) character checking */ + const unsigned char *ic = (const unsigned char *)ignorechars_buf.buf; + for (Py_ssize_t j = 0; j < ignorechars_buf.len; j++) { + ignorechars_table[ic[j]] = 1; + } + } + /* Allocate the buffer */ Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */ PyBytesWriter *writer = PyBytesWriter_Create(bin_len); if (writer == NULL) { + if (has_ignorechars) { + PyBuffer_Release(&ignorechars_buf); + } return NULL; } unsigned char *bin_data = PyBytesWriter_GetData(writer); @@ -517,8 +544,9 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) /* Fast path: use optimized decoder for complete quads. * This works for both strict and non-strict mode for valid input. 
* The fast path stops at padding, invalid chars, or incomplete groups. + * Skip fast path when ignorechars is provided, as we need to check each char. */ - if (ascii_len >= 4) { + if (ascii_len >= 4 && !has_ignorechars) { Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len, bin_data, table_a2b_base64); if (fast_chars > 0) { @@ -533,6 +561,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) int pads = 0; for (; i < ascii_len; i++) { unsigned char this_ch = ascii_data[i]; + unsigned char orig_ch = this_ch; /* Save original for ignorechars check */ /* Check for pad sequences and ignore ** the invalid ones. @@ -567,7 +596,20 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) this_ch = table_a2b_base64[this_ch]; if (this_ch >= 64) { - if (strict_mode) { + /* Non-base64 character found */ + if (has_ignorechars) { + /* When ignorechars is provided, only skip if char is in the set */ + if (ignorechars_table[orig_ch]) { + continue; /* Character is in ignorechars, skip it */ + } + /* Character not in ignorechars, raise error */ + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, "Only base64 data is allowed"); + } + goto error_end; + } + else if (strict_mode) { state = get_binascii_state(module); if (state) { PyErr_SetString(state->Error, "Only base64 data is allowed"); @@ -634,9 +676,15 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) } done: + if (has_ignorechars) { + PyBuffer_Release(&ignorechars_buf); + } return PyBytesWriter_FinishWithPointer(writer, bin_data); error_end: + if (has_ignorechars) { + PyBuffer_Release(&ignorechars_buf); + } PyBytesWriter_Discard(writer); return NULL; } diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index 524f5fc93d0c21..a5dffa9350e897 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -116,20 +116,26 @@ binascii_b2a_uu(PyObject *module, 
PyObject *const *args, Py_ssize_t nargs, PyObj } PyDoc_STRVAR(binascii_a2b_base64__doc__, -"a2b_base64($module, data, /, *, strict_mode=False)\n" +"a2b_base64($module, data, /, *, strict_mode=False, ignorechars=None)\n" "--\n" "\n" "Decode a line of base64 data.\n" "\n" " strict_mode\n" " When set to True, bytes that are not part of the base64 standard are not allowed.\n" -" The same applies to excess data after padding (= / ==)."); +" The same applies to excess data after padding (= / ==).\n" +" ignorechars\n" +" A bytes-like object specifying characters to ignore during decoding.\n" +" When provided, only characters in this set will be silently ignored;\n" +" other non-base64 characters will cause an error. When None (the default),\n" +" all non-base64 characters are silently ignored (unless strict_mode is True)."); #define BINASCII_A2B_BASE64_METHODDEF \ {"a2b_base64", _PyCFunction_CAST(binascii_a2b_base64), METH_FASTCALL|METH_KEYWORDS, binascii_a2b_base64__doc__}, static PyObject * -binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode); +binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode, + PyObject *ignorechars); static PyObject * binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) @@ -137,7 +143,7 @@ binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 1 + #define NUM_KEYWORDS 2 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD @@ -146,7 +152,7 @@ binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P } _kwtuple = { .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) .ob_hash = -1, - .ob_item = { &_Py_ID(strict_mode), }, + .ob_item = { &_Py_ID(strict_mode), &_Py_ID(ignorechars), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -155,17 +161,18 @@ 
binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"", "strict_mode", NULL}; + static const char * const _keywords[] = {"", "strict_mode", "ignorechars", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "a2b_base64", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; Py_buffer data = {NULL, NULL}; int strict_mode = 0; + PyObject *ignorechars = Py_None; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); @@ -178,12 +185,18 @@ binascii_a2b_base64(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P if (!noptargs) { goto skip_optional_kwonly; } - strict_mode = PyObject_IsTrue(args[1]); - if (strict_mode < 0) { - goto exit; + if (args[1]) { + strict_mode = PyObject_IsTrue(args[1]); + if (strict_mode < 0) { + goto exit; + } + if (!--noptargs) { + goto skip_optional_kwonly; + } } + ignorechars = args[2]; skip_optional_kwonly: - return_value = binascii_a2b_base64_impl(module, &data, strict_mode); + return_value = binascii_a2b_base64_impl(module, &data, strict_mode, ignorechars); exit: /* Cleanup for data */ @@ -823,4 +836,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=644ccdc8e0d56e65 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=a1b6612dfe6454f5 input=a9049054013a1b77]*/