Make ord() Unicode-aware

This commit is contained in:
Chris Angelico 2014-06-07 11:25:06 +10:00
parent 03f0cbe905
commit bb13212071
3 changed files with 27 additions and 8 deletions

View File

@ -340,14 +340,23 @@ STATIC mp_obj_t mp_builtin_oct(mp_obj_t o_in) {
MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_oct_obj, mp_builtin_oct);
// Builtin ord(): returns the integer code of the single character held by o_in.
// The data and its two lengths come from mp_obj_str_get_data_len(). When o_in
// is a str whose first byte has the high bit set, the UTF-8 sequence is decoded
// into one code point; otherwise the first byte is returned as an unsigned
// value. Raises TypeError when the character count is not exactly 1.
STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) {
    uint len, charlen;
    // work on unsigned bytes so nothing is sign extended when converting to ord
    const byte *p = (const byte*)mp_obj_str_get_data_len(o_in, &len, &charlen);
    if (charlen == 1) {
        if (MP_OBJ_IS_STR(o_in) && (*p & 0x80)) {
            // never read past the string data, even if it is malformed or
            // not followed by a terminating byte
            const byte *end = p + len;
            // lead byte: drop the top bit, then strip the remaining run of
            // leading 1 bits (the length marker) to leave the payload bits
            machine_int_t ord = *p++ & 0x7F;
            for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
                ord &= ~mask;
            }
            // each continuation byte (10xxxxxx) contributes 6 more bits
            while (p < end && (*p & 0xC0) == 0x80) {
                ord = (ord << 6) | (*p++ & 0x3F);
            }
            return mp_obj_new_int(ord);
        } else {
            return mp_obj_new_int(p[0]);
        }
    } else {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", charlen));
    }
}

View File

@ -468,6 +468,7 @@ uint mp_obj_str_get_len(mp_obj_t self_in);
qstr mp_obj_str_get_qstr(mp_obj_t self_in); // use this if you will anyway convert the string to a qstr
const char *mp_obj_str_get_str(mp_obj_t self_in); // use this only if you need the string to be null terminated
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len);
const char *mp_obj_str_get_data_len(mp_obj_t self_in, uint *len, uint *charlen); // returns the string data and stores both its stored length (*len) and its character count (*charlen)
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len);
#if MICROPY_PY_BUILTINS_FLOAT

View File

@ -53,7 +53,6 @@ const mp_obj_t mp_const_empty_bytes;
// Declares str_data, str_len and str_flags in the enclosing scope and fills
// them via qstr_data() when str_obj_in is a qstr, otherwise from the data, len
// and flags fields of the mp_obj_str_t. It expands to declarations followed by
// an if/else, so the chosen names must be unused in that scope.
#define GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_flags) const byte *str_data; uint str_len; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }
// use this macro to extract the string data, lengths, and flags
// (declares str_data, str_len, str_charlen and str_flags in the enclosing scope;
// for a qstr the character count is obtained from qstr_charlen(), otherwise it
// is read from the charlen field of the mp_obj_str_t)
#define GET_STR_INFO(str_obj_in, str_data, str_len, str_charlen, str_flags) const byte *str_data; uint str_len, str_charlen; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); str_charlen = qstr_charlen(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_charlen = ((mp_obj_str_t*)str_obj_in)->charlen; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }
// don't use this macro, it's only for conversions
@ -1864,6 +1863,16 @@ const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
}
}
// Returns a pointer to the data of a str/bytes object and reports two lengths
// through the out-parameters: *len receives the object's stored length and
// *charlen its stored character count (both as extracted by GET_STR_INFO).
// Any other object type is handed to bad_implicit_conversion().
const char *mp_obj_str_get_data_len(mp_obj_t self_in, uint *len, uint *charlen) {
    if (!is_str_or_bytes(self_in)) {
        bad_implicit_conversion(self_in);
    } else {
        GET_STR_INFO(self_in, data, data_len, char_count, flags);
        *len = data_len;
        *charlen = char_count;
        return (const char*)data;
    }
}
/******************************************************************************/
/* str iterator */