python調用pybind11導出的pyd,出現UnicodeDecodeError
1. 問題描述
-
舉個例子,當有以下C++代碼以及Pybind11的綁定代碼時,在python訪問包含中文的Name和Value會有UnicodeDecodeError的異常!
// Example property type: a plain name/value pair of strings.
// The two constructors are only declared here; their definitions are not shown.
class VxUserProp {
public:
    VxUserProp();
    VxUserProp(const string &strName, const string &strVal);

    string m_strName; // bound to Python as the "Name" attribute below
    string m_strVal;  // bound to Python as the "Value" attribute below
};

// pybind11 binding for VxUserProp (`m` is presumably the enclosing module
// object; it is defined outside this snippet).
py::class_<VxUserProp>(m, "VxUserProp")
    .def(py::init<>())
    .def(py::init<const std::string &, const std::string &>())
    .def_readwrite("Name", &VxUserProp::m_strName, "屬性名稱")
    .def_readwrite("Value", &VxUserProp::m_strVal, "屬性值")
    .def("__repr__", [](const VxUserProp &self) {
        // Assemble "VxUserProp(Name='<name>', Value='<value>')" piece by piece.
        std::string text = "VxUserProp(Name='";
        text += self.m_strName;
        text += "', Value='";
        text += self.m_strVal;
        text += "')";
        return text;
    });
2. 原因分析
- 經過查看官方文檔和分析,原因是我的C++代碼是gbk編碼,string中存的是gbk編碼的數據。當 C++ 函數返回 std::string 或 char* 給 Python 調用者時,pybind11 會假定該字符串是有效的 UTF-8,并將其解碼為原生 Python str,使用與 Python 執行 bytes.decode('utf-8') 相同的 API。如果這種隱式轉換失敗,pybind11 將引發 UnicodeDecodeError。
3. 解決方案
3.1 方案一:顯式轉換
-
根據官方文檔「字符串、字節和 Unicode 轉換」(Strings, bytes and Unicode conversions - pybind11 documentation):
-
如果某些 C++代碼構造的 std::string 不是 UTF-8 字符串,可以進行顯式轉換并返回一個 py::str 對象。顯式轉換與隱式轉換具有相同的開銷。
// This uses the Python C API to convert Latin-1 to Unicode
m.def("str_output", []() {
    std::string s = "Send your r\xe9sum\xe9 to Alice in HR"; // Latin-1
    py::handle py_s = PyUnicode_DecodeLatin1(s.data(), s.length(), nullptr);
    if (!py_s) {
        throw py::error_already_set();
    }
    return py::reinterpret_steal<py::str>(py_s);
});
-
顯而易見的是,這種寫法,工作量太大!
3.2 方案二:修改pybind11源碼
-
根據官方文檔「概述」(Overview - pybind11 documentation)中所介紹的,pybind11 內置了一些類型轉換。我們只需要找到這個轉換的代碼進行修改即可。
-
最終我們在include/pybind11/cast.h中找到了string_caster,并進行了修改!具體修改參考以下代碼中的"load修改部分"和"cast修改部分"。進行以下修改后,其他地方不用進行任何修改! 對比上一種方法,工作量要小得多。
// Helper class for UTF-{8,16,32} C++ stl strings: template <typename StringType, bool IsView = false> struct string_caster {using CharT = typename StringType::value_type;// Simplify life by being able to assume standard char sizes (the standard only guarantees// minimums, but Python requires exact sizes)static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1,"Unsupported char size != 1"); #if defined(PYBIND11_HAS_U8STRING)static_assert(!std::is_same<CharT, char8_t>::value || sizeof(CharT) == 1,"Unsupported char8_t size != 1"); #endifstatic_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2,"Unsupported char16_t size != 2");static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4,"Unsupported char32_t size != 4");// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,"Unsupported wchar_t size != 2/4");static constexpr size_t UTF_N = 8 * sizeof(CharT);bool load(handle src, bool) { #ifdef _WIN32 //----------------------load修改部分-----------------------------if constexpr (std::is_same<StringType, std::string>::value) {if (!src) return false;if (!PyUnicode_Check(src.ptr())) return false;// 總是用GBK編碼PyObject* gbk_bytes = PyUnicode_AsEncodedString(src.ptr(), "gbk", "replace");if (!gbk_bytes) return false;const char* gbk_data = PyBytes_AS_STRING(gbk_bytes);Py_ssize_t gbk_size = PyBytes_GET_SIZE(gbk_bytes);value.assign(gbk_data, gbk_size);Py_DECREF(gbk_bytes);return true;} #endif //-----------------------load修改部分結束----------------------------------// 其它類型/平臺走原邏輯handle load_src = src;if (!src) {return false;}if (!PyUnicode_Check(load_src.ptr())) {return load_raw(load_src);}// For UTF-8 we avoid the need for a temporary `bytes` object by using// `PyUnicode_AsUTF8AndSize`.if (UTF_N == 8) {Py_ssize_t size = -1;const auto *buffer= reinterpret_cast<const CharT *>(PyUnicode_AsUTF8AndSize(load_src.ptr(), &size));if (!buffer) 
{PyErr_Clear();return false;}value = StringType(buffer, static_cast<size_t>(size));return true;}auto utfNbytes= reinterpret_steal<object>(PyUnicode_AsEncodedString(load_src.ptr(),UTF_N == 8 ? "utf-8": UTF_N == 16 ? "utf-16": "utf-32",nullptr));if (!utfNbytes) {PyErr_Clear();return false;}const auto *buffer= reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);// Skip BOM for UTF-16/32if (UTF_N > 8) {buffer++;length--;}value = StringType(buffer, length);// If we're loading a string_view we need to keep the encoded Python object alive:if (IsView) {loader_life_support::add_patient(utfNbytes);}return true;}static handlecast(const StringType &src, return_value_policy /* policy */, handle /* parent */) { #ifdef _WIN32 //----------------------cast修改部分-----------------------------if constexpr (std::is_same<StringType, std::string>::value) {// 直接用GBK解碼PyObject* pystr = PyUnicode_Decode(src.data(), (Py_ssize_t)src.size(), "gbk", "replace");if (!pystr) return pybind11::none().release();return pybind11::handle(pystr);} #endif //----------------------cast修改部分-----------------------------// 其它類型/平臺走原邏輯const char *buffer = reinterpret_cast<const char *>(src.data());auto nbytes = ssize_t(src.size() * sizeof(CharT));handle s = decode_utfN(buffer, nbytes);if (!s) {throw error_already_set();}return s;}PYBIND11_TYPE_CASTER(StringType, const_name(PYBIND11_STRING_NAME));private:static handle decode_utfN(const char *buffer, ssize_t nbytes) { #if !defined(PYPY_VERSION)return UTF_N == 8 ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr): UTF_N == 16 ? 
PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr): PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr); #else// PyPy segfaults when on PyUnicode_DecodeUTF16 (and possibly on PyUnicode_DecodeUTF32 as// well), so bypass the whole thing by just passing the encoding as a string value, which// works properly:return PyUnicode_Decode(buffer,nbytes,UTF_N == 8 ? "utf-8": UTF_N == 16 ? "utf-16": "utf-32",nullptr); #endif}// When loading into a std::string or char*, accept a bytes/bytearray object as-is (i.e.// without any encoding/decoding attempt). For other C++ char sizes this is a no-op.// which supports loading a unicode from a str, doesn't take this path.template <typename C = CharT>bool load_raw(enable_if_t<std::is_same<C, char>::value, handle> src) {if (PYBIND11_BYTES_CHECK(src.ptr())) {// We were passed raw bytes; accept it into a std::string or char*// without any encoding attempt.const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr());if (!bytes) {pybind11_fail("Unexpected PYBIND11_BYTES_AS_STRING() failure.");}value = StringType(bytes, (size_t) PYBIND11_BYTES_SIZE(src.ptr()));return true;}if (PyByteArray_Check(src.ptr())) {// We were passed a bytearray; accept it into a std::string or char*// without any encoding attempt.const char *bytearray = PyByteArray_AsString(src.ptr());if (!bytearray) {pybind11_fail("Unexpected PyByteArray_AsString() failure.");}value = StringType(bytearray, (size_t) PyByteArray_Size(src.ptr()));return true;}return false;}template <typename C = CharT>bool load_raw(enable_if_t<!std::is_same<C, char>::value, handle>) {return false;} };