From e3cfc0d33d04a33b6f546f8018991c34768a871d Mon Sep 17 00:00:00 2001 From: Paul Sokolovsky Date: Sat, 14 Jun 2014 06:06:36 +0300 Subject: [PATCH] objstr: Refactor to work with char pointers instead of indexes. In preparation for unicode support. --- py/objstr.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index f9cc273447..c84d7c900d 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -352,6 +352,12 @@ uncomparable: return MP_OBJ_NULL; // op not supported } +const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, + mp_obj_t index, bool is_slice) { + machine_uint_t index_val = mp_get_index(type, self_len, index, is_slice); + return self_data + index_val; +} + STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { mp_obj_type_t *type = mp_obj_get_type(self_in); GET_STR_DATA_LEN(self_in, self_data, self_len); @@ -367,11 +373,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start); } #endif - uint index_val = mp_get_index(type, self_len, index, false); + const byte *p = str_index_to_ptr(type, self_data, self_len, index, false); if (type == &mp_type_bytes) { - return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]); + return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)*p); } else { - return mp_obj_new_str((char*)self_data + index_val, 1, true); + return mp_obj_new_str((char*)p, 1, true); } } else { return MP_OBJ_NULL; // op not supported @@ -567,6 +573,7 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) { STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) { + const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); assert(2 <= n_args && n_args <= 4); assert(MP_OBJ_IS_STR(args[0])); assert(MP_OBJ_IS_STR(args[1])); @@ -574,16 +581,16 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire GET_STR_DATA_LEN(args[0], haystack, haystack_len); GET_STR_DATA_LEN(args[1], needle, needle_len); - machine_uint_t start = 0; - machine_uint_t end = haystack_len; + const byte *start = haystack; + const byte *end = haystack + haystack_len; if (n_args >= 3 && args[2] != mp_const_none) { - start = mp_get_index(&mp_type_str, haystack_len, args[2], true); + start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true); } if (n_args >= 4 && args[3] != mp_const_none) { - end = mp_get_index(&mp_type_str, haystack_len, args[3], true); + end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true); } - const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction); + const byte *p = find_subbytes(start, end - start, needle, needle_len, direction); if (p == NULL) { // not found if (is_index) { @@ -615,16 +622,17 @@ STATIC mp_obj_t str_rindex(uint n_args, const mp_obj_t *args) { // TODO: (Much) more variety in args STATIC mp_obj_t str_startswith(uint n_args, const mp_obj_t *args) { + const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); GET_STR_DATA_LEN(args[0], str, str_len); GET_STR_DATA_LEN(args[1], prefix, prefix_len); - uint index_val = 0; + const byte *start = str; if (n_args > 2) { - index_val = mp_get_index(&mp_type_str, str_len, args[2], true); + start = str_index_to_ptr(self_type, str, str_len, args[2], true); } - if (prefix_len + index_val > str_len) { + if (prefix_len + (start - str) > str_len) { return mp_const_false; } - return MP_BOOL(memcmp(str + index_val, prefix, prefix_len) == 0); + return MP_BOOL(memcmp(start, prefix, prefix_len) == 0); } STATIC mp_obj_t str_endswith(uint n_args, const mp_obj_t *args) { @@ -1422,6 +1430,7 @@ STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) { } STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) { + const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); assert(2 <= n_args && n_args <= 4); assert(MP_OBJ_IS_STR(args[0])); assert(MP_OBJ_IS_STR(args[1])); @@ -1429,26 +1438,28 @@ STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) { GET_STR_DATA_LEN(args[0], haystack, haystack_len); GET_STR_DATA_LEN(args[1], needle, needle_len); - machine_uint_t start = 0; - machine_uint_t end = haystack_len; + const byte *start = haystack; + const byte *end = haystack + haystack_len; if (n_args >= 3 && args[2] != mp_const_none) { - start = mp_get_index(&mp_type_str, haystack_len, args[2], true); + start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true); } if (n_args >= 4 && args[3] != mp_const_none) { - end = mp_get_index(&mp_type_str, haystack_len, args[3], true); + end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true); } // if needle_len is zero then we count each gap between characters as an occurrence if (needle_len == 0) { - return MP_OBJ_NEW_SMALL_INT(end - start + 1); + return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1); } // count the occurrences machine_int_t num_occurrences = 0; - for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) { - if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) { + for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) { + if (memcmp(haystack_ptr, needle, needle_len) == 0) { num_occurrences++; - haystack_index += needle_len - 1; + haystack_ptr += needle_len; + } else { + haystack_ptr = utf8_next_char(haystack_ptr); } }