Improve parse_identifier (#4691)

ASCII string length is no longer computed during string allocation.

JerryScript-DCO-1.0-Signed-off-by: Daniel Batiz [email protected]
jerryscript-project · Aug 17, 2021 · 3bcd48f · 3bcd48f
1 parent e7ffb70
commit 3bcd48f
Show file tree

Hide file tree

Showing 19 changed files with 172 additions and 92 deletions.
diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c
@@ -335,6 +335,33 @@ ecma_find_special_string (const lit_utf8_byte_t *string_p, /**< utf8 string */
   return NULL;
 } /* ecma_find_special_string */
 
+/**
+ * Allocate new ecma-string from an ascii-only string; its length is not computed (string_size is passed twice)
+ *
+ * @return pointer to ecma-string descriptor (the result of ecma_find_special_string, when that is not NULL)
+ */
+ecma_string_t *
+ecma_new_ecma_string_from_ascii (const lit_utf8_byte_t *string_p, /**< ascii string (may be NULL if size is 0) */
+                                 lit_utf8_size_t string_size) /**< string size in bytes */
+{
+  JERRY_ASSERT (string_p != NULL || string_size == 0);
+
+  ecma_string_t *string_desc_p = ecma_find_special_string (string_p, string_size);
+
+  if (string_desc_p != NULL)
+  {
+    return string_desc_p; /* returned before the allocation below is reached */
+  }
+
+  lit_utf8_byte_t *data_p;
+  string_desc_p = ecma_new_ecma_string_from_utf8_buffer (string_size, string_size, &data_p);
+
+  string_desc_p->u.hash = lit_utf8_string_calc_hash (string_p, string_size);
+  memcpy (data_p, string_p, string_size);
+
+  return string_desc_p;
+} /* ecma_new_ecma_string_from_ascii */
+
 /**
  * Allocate new ecma-string and fill it with characters from the utf8 string
  *
@@ -2449,8 +2476,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
 
   if (string_length == buffer_size)
   {
-    ecma_string_p = ecma_new_ecma_string_from_utf8 (start_p + start_pos,
-                                                    (lit_utf8_size_t) end_pos);
+    ecma_string_p = ecma_new_ecma_string_from_utf8 (start_p + start_pos, (lit_utf8_size_t) end_pos);
   }
   else
   {

diff --git a/jerry-core/ecma/base/ecma-helpers.h b/jerry-core/ecma/base/ecma-helpers.h
@@ -299,7 +299,10 @@ ecma_length_t ecma_op_advance_string_index (ecma_string_t *str_p, ecma_length_t
 ecma_string_t *ecma_new_map_key_string (ecma_value_t value);
 bool ecma_prop_name_is_map_key (ecma_string_t *string_p);
 #endif /* JERRY_BUILTIN_CONTAINER */
-ecma_string_t *ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, lit_utf8_size_t string_size);
+ecma_string_t *ecma_new_ecma_string_from_ascii (const lit_utf8_byte_t *string_p,
+                                                lit_utf8_size_t string_size);
+ecma_string_t *ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p,
+                                               lit_utf8_size_t string_size);
 ecma_string_t *ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *string_p,
                                                                   lit_utf8_size_t string_size);
 ecma_string_t *ecma_new_ecma_external_string_from_cesu8 (const lit_utf8_byte_t *string_p, lit_utf8_size_t string_size,

diff --git a/jerry-core/ecma/base/ecma-literal-storage.c b/jerry-core/ecma/base/ecma-literal-storage.c
@@ -165,9 +165,11 @@ ecma_finalize_lit_storage (void)
  */
 ecma_value_t
 ecma_find_or_create_literal_string (const lit_utf8_byte_t *chars_p, /**< string to be searched */
-                                    lit_utf8_size_t size) /**< size of the string */
+                                    lit_utf8_size_t size, /**< size of the string */
+                                    bool is_ascii) /**< true, if the string contains only ascii characters */
 {
-  ecma_string_t *string_p = ecma_new_ecma_string_from_utf8 (chars_p, size);
+  ecma_string_t *string_p = (is_ascii ? ecma_new_ecma_string_from_ascii (chars_p, size)
+                                      : ecma_new_ecma_string_from_utf8 (chars_p, size));
 
   if (ECMA_IS_DIRECT_STRING (string_p))
   {
@@ -702,7 +704,7 @@ ecma_snapshot_get_literal (const uint8_t *literal_base_p, /**< literal start */
 
   uint16_t length = *(const uint16_t *) literal_p;
 
-  return ecma_find_or_create_literal_string (literal_p + sizeof (uint16_t), length);
+  return ecma_find_or_create_literal_string (literal_p + sizeof (uint16_t), length, false);
 } /* ecma_snapshot_get_literal */
 
 /**

diff --git a/jerry-core/ecma/base/ecma-literal-storage.h b/jerry-core/ecma/base/ecma-literal-storage.h
@@ -40,7 +40,7 @@ typedef struct
 
 void ecma_finalize_lit_storage (void);
 
-ecma_value_t ecma_find_or_create_literal_string (const lit_utf8_byte_t *chars_p, lit_utf8_size_t size);
+ecma_value_t ecma_find_or_create_literal_string (const lit_utf8_byte_t *chars_p, lit_utf8_size_t size, bool is_ascii);
 ecma_value_t ecma_find_or_create_literal_number (ecma_number_t number_arg);
 #if JERRY_BUILTIN_BIGINT
 ecma_value_t ecma_find_or_create_literal_bigint (ecma_value_t bigint);

diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-date.c b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-date.c
@@ -680,7 +680,7 @@ ecma_date_to_string_format (ecma_number_t datetime_number, /**< datetime */
 
   JERRY_ASSERT (dest_p <= date_buffer + date_buffer_length);
 
-  return ecma_make_string_value (ecma_new_ecma_string_from_utf8 (date_buffer,
+  return ecma_make_string_value (ecma_new_ecma_string_from_ascii (date_buffer,
                                                                  (lit_utf8_size_t) (dest_p - date_buffer)));
 } /* ecma_date_to_string_format */
 

diff --git a/jerry-core/ecma/operations/ecma-bigint.c b/jerry-core/ecma/operations/ecma-bigint.c
@@ -250,7 +250,7 @@ ecma_bigint_to_string (ecma_value_t value, /**< BigInt value */
   }
 
   ecma_string_t *string_p;
-  string_p = ecma_new_ecma_string_from_utf8 (string_buffer_p + char_start_p, char_size_p - char_start_p);
+  string_p = ecma_new_ecma_string_from_ascii (string_buffer_p + char_start_p, char_size_p - char_start_p);
 
   jmem_heap_free_block (string_buffer_p, char_size_p);
   return string_p;

diff --git a/jerry-core/parser/js/common.h b/jerry-core/parser/js/common.h
@@ -74,8 +74,9 @@ typedef enum
   LEXER_FLAG_SOURCE_PTR = (1 << 2), /**< the literal is directly referenced in the source code
                                      *   (no need to allocate memory) */
   LEXER_FLAG_LATE_INIT = (1 << 3), /**< initialize this variable after the byte code is freed */
+  LEXER_FLAG_ASCII = (1 << 4), /**< the literal contains only ascii characters */
 #if JERRY_ESNEXT
-  LEXER_FLAG_GLOBAL = (1 << 4), /**< this local identifier is not a let or const declaration */
+  LEXER_FLAG_GLOBAL = (1 << 5), /**< this local identifier is not a let or const declaration */
 #endif /* JERRY_ESNEXT */
 } lexer_literal_status_flags_t;
 

diff --git a/jerry-core/parser/js/js-lexer.c b/jerry-core/parser/js/js-lexer.c
@@ -642,7 +642,7 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
   parser_line_counter_t column = context_p->column;
   const uint8_t *source_end_p = context_p->source_end_p;
   size_t length = 0;
-  uint8_t has_escape = false;
+  lexer_lit_location_flags_t status_flags = LEXER_LIT_LOCATION_IS_ASCII;
 
   do
   {
@@ -657,7 +657,7 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
         return true;
       }
 
-      has_escape = true;
+      status_flags = LEXER_LIT_LOCATION_HAS_ESCAPE;
 
 #if JERRY_ESNEXT
       if (source_p + 5 <= source_end_p && source_p[1] == LIT_CHAR_LOWERCASE_U)
@@ -711,6 +711,8 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
 
     if (JERRY_UNLIKELY (code_point >= LIT_UTF8_2_BYTE_MARKER))
     {
+      status_flags &= (uint32_t) ~LEXER_LIT_LOCATION_IS_ASCII;
+
 #if JERRY_ESNEXT
       utf8_length = lit_read_code_point_from_utf8 (source_p,
                                                    (lit_utf8_size_t) (source_end_p - source_p),
@@ -738,7 +740,7 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
       else if (source_p[0] >= LIT_UTF8_4_BYTE_MARKER)
       {
         decoded_length = 2 * 3;
-        has_escape = true;
+        status_flags = LEXER_LIT_LOCATION_HAS_ESCAPE;
       }
 #else /* !JERRY_ESNEXT */
       if (code_point < LIT_UTF8_4_BYTE_MARKER)
@@ -789,7 +791,7 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
 
   context_p->token.type = LEXER_LITERAL;
   context_p->token.lit_location.type = LEXER_IDENT_LITERAL;
-  context_p->token.lit_location.has_escape = has_escape;
+  context_p->token.lit_location.status_flags = (uint8_t) status_flags;
 
   context_p->token.column = context_p->column;
   context_p->token.lit_location.char_p = context_p->source_p;
@@ -807,7 +809,7 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */
     const uint8_t *ident_start_p = context_p->source_p;
     uint8_t buffer_p[LEXER_KEYWORD_MAX_LENGTH];
 
-    if (JERRY_UNLIKELY (context_p->token.lit_location.has_escape))
+    if (JERRY_UNLIKELY (context_p->token.lit_location.status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE))
     {
       lexer_convert_ident_to_cesu8 (buffer_p, ident_start_p, (prop_length_t) length);
       ident_start_p = buffer_p;
@@ -953,7 +955,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */
   parser_line_counter_t original_line = line;
   parser_line_counter_t original_column = column;
   size_t length = 0;
-  uint8_t has_escape = false;
+  lexer_lit_location_flags_t status_flags = LEXER_LIT_LOCATION_IS_ASCII;
 
 #if JERRY_ESNEXT
   if (str_end_character == LIT_CHAR_RIGHT_BRACE)
@@ -986,7 +988,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */
         continue;
       }
 
-      has_escape = true;
+      status_flags = LEXER_LIT_LOCATION_HAS_ESCAPE;
 
       /* Newline is ignored. */
       if (*source_p == LIT_CHAR_CR)
@@ -1163,7 +1165,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */
        * after a backslash). Always converted to two 3 byte
        * long sequence. */
       length += 2 * 3;
-      has_escape = true;
+      status_flags = LEXER_LIT_LOCATION_HAS_ESCAPE;
       source_p += 4;
 #if JERRY_ESNEXT
       raw_length_adjust += 2;
@@ -1192,7 +1194,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */
          Note: ECMAScript v6, 11.8.6.1 <CR> or <CR><LF> are both normalized to <LF> */
       if (*source_p == LIT_CHAR_CR)
       {
-        has_escape = true;
+        status_flags = LEXER_LIT_LOCATION_HAS_ESCAPE;
         source_p++;
         length++;
         if (source_p < source_end_p
@@ -1261,7 +1263,7 @@ lexer_parse_string (parser_context_t *context_p, /**< context */
   context_p->token.lit_location.char_p = string_start_p;
   context_p->token.lit_location.length = (prop_length_t) length;
   context_p->token.lit_location.type = LEXER_STRING_LITERAL;
-  context_p->token.lit_location.has_escape = has_escape;
+  context_p->token.lit_location.status_flags = (uint8_t) status_flags;
 
   context_p->source_p = source_p + 1;
   context_p->line = line;
@@ -1328,7 +1330,7 @@ lexer_parse_number (parser_context_t *context_p) /**< context */
   context_p->token.extra_value = LEXER_NUMBER_DECIMAL;
   context_p->token.lit_location.char_p = source_p;
   context_p->token.lit_location.type = LEXER_NUMBER_LITERAL;
-  context_p->token.lit_location.has_escape = false;
+  context_p->token.lit_location.status_flags = LEXER_LIT_LOCATION_IS_ASCII;
 
   if (source_p[0] == LIT_CHAR_0
       && source_p + 1 < source_end_p)
@@ -2240,7 +2242,7 @@ lexer_convert_literal_to_chars (parser_context_t *context_p, /**< context */
 {
   JERRY_ASSERT (context_p->u.allocated_buffer_p == NULL);
 
-  if (!literal_p->has_escape)
+  if (!(literal_p->status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE))
   {
     return literal_p->char_p;
   }
@@ -2601,6 +2603,11 @@ lexer_construct_literal_object (parser_context_t *context_p, /**< context */
     status_flags |= LEXER_FLAG_USED;
   }
 
+  if (lit_location_p->status_flags & LEXER_LIT_LOCATION_IS_ASCII)
+  {
+    status_flags |= LEXER_FLAG_ASCII; /* set on the local: it is stored into literal_p->status_flags below */
+  }
+
   literal_p->status_flags = status_flags;
 
   context_p->lit_object.literal_p = literal_p;
@@ -3490,7 +3497,7 @@ lexer_compare_identifier_to_string (const lexer_lit_location_t *left_p, /**< lef
     return false;
   }
 
-  if (!left_p->has_escape)
+  if (!(left_p->status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE))
   {
     return memcmp (left_p->char_p, right_p, size) == 0;
   }
@@ -3518,12 +3525,12 @@ lexer_compare_identifiers (parser_context_t *context_p, /**< context */
     return false;
   }
 
-  if (!left_p->has_escape)
+  if (!(left_p->status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE))
   {
     return lexer_compare_identifier_to_chars (right_p->char_p, left_p->char_p, length);
   }
 
-  if (!right_p->has_escape)
+  if (!(right_p->status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE))
   {
     return lexer_compare_identifier_to_chars (left_p->char_p, right_p->char_p, length);
   }
@@ -3568,7 +3575,7 @@ lexer_current_is_literal (parser_context_t *context_p, /**< context */
     return false;
   }
 
-  if (!left_ident_p->has_escape && !right_ident_p->has_escape)
+  if (!((left_ident_p->status_flags | right_ident_p->status_flags) & LEXER_LIT_LOCATION_HAS_ESCAPE))
   {
     return memcmp (left_ident_p->char_p, right_ident_p->char_p, left_ident_p->length) == 0;
   }
@@ -3591,7 +3598,7 @@ lexer_string_is_use_strict (parser_context_t *context_p) /**< context */
                 && context_p->token.lit_location.type == LEXER_STRING_LITERAL);
 
   return (context_p->token.lit_location.length == 10
-          && !context_p->token.lit_location.has_escape
+          && !(context_p->token.lit_location.status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE)
           && memcmp (context_p->token.lit_location.char_p, "use strict", 10) == 0);
 } /* lexer_string_is_use_strict */
 
@@ -3649,7 +3656,7 @@ lexer_token_is_let (parser_context_t *context_p) /**< context */
   JERRY_ASSERT (context_p->token.type == LEXER_LITERAL);
 
   return (context_p->token.keyword_type == LEXER_KEYW_LET
-          && !context_p->token.lit_location.has_escape);
+          && !(context_p->token.lit_location.status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE));
 } /* lexer_token_is_let */
 
 /**
@@ -3667,7 +3674,7 @@ lexer_token_is_async (parser_context_t *context_p) /**< context */
                 || context_p->token.type == LEXER_TEMPLATE_LITERAL);
 
   return (context_p->token.keyword_type == LEXER_KEYW_ASYNC
-          && !context_p->token.lit_location.has_escape);
+          && !(context_p->token.lit_location.status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE));
 } /* lexer_token_is_async */
 
 #endif /* JERRY_ESNEXT */

diff --git a/jerry-core/parser/js/js-lexer.h b/jerry-core/parser/js/js-lexer.h
@@ -305,6 +305,16 @@ typedef enum
 #endif /* JERRY_BUILTIN_BIGINT */
 } lexer_number_type_t;
 
+/**
+ * Lexer literal location flags (stored in the status_flags member of lexer_lit_location_t).
+ */
+typedef enum
+{
+  LEXER_LIT_LOCATION_NO_OPTS = 0,           /**< no options */
+  LEXER_LIT_LOCATION_HAS_ESCAPE = (1 << 0), /**< literal has escape sequences (also set for 4 byte utf8 and CR) */
+  LEXER_LIT_LOCATION_IS_ASCII = (1 << 1),   /**< all characters are ascii characters */
+} lexer_lit_location_flags_t;
+
 /**
  * Lexer character (string / identifier) literal data.
  */
@@ -313,7 +323,7 @@ typedef struct
   const uint8_t *char_p;                     /**< start of identifier or string token */
   prop_length_t length;                      /**< length or index of a literal */
   uint8_t type;                              /**< type of the current literal */
-  uint8_t has_escape;                        /**< has escape sequences */
+  uint8_t status_flags;                      /**< any combination of lexer_lit_location_flags_t status bits */
 } lexer_lit_location_t;
 
 /**

diff --git a/jerry-core/parser/js/js-parser-expr.c b/jerry-core/parser/js/js-parser-expr.c
@@ -1922,7 +1922,7 @@ parser_parse_unary_expression (parser_context_t *context_p, /**< context */
         }
 #endif /* JERRY_MODULE_SYSTEM */
 
-        if (JERRY_UNLIKELY (context_p->token.lit_location.has_escape))
+        if (JERRY_UNLIKELY (context_p->token.lit_location.status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE))
         {
           parser_raise_error (context_p, PARSER_ERR_INVALID_KEYWORD);
         }
@@ -2281,7 +2281,7 @@ parser_parse_unary_expression (parser_context_t *context_p, /**< context */
       JERRY_ASSERT ((context_p->status_flags & PARSER_IS_GENERATOR_FUNCTION)
                     && !(context_p->status_flags & PARSER_DISALLOW_AWAIT_YIELD));
 
-      if (context_p->token.lit_location.has_escape)
+      if (context_p->token.lit_location.status_flags & LEXER_LIT_LOCATION_HAS_ESCAPE)
       {
         parser_raise_error (context_p, PARSER_ERR_INVALID_KEYWORD);
       }

diff --git a/jerry-core/parser/js/js-parser-internal.h b/jerry-core/parser/js/js-parser-internal.h
@@ -724,6 +724,7 @@ void parser_emit_cbc_forward_branch (parser_context_t *context_p, uint16_t opcod
 parser_branch_node_t *parser_emit_cbc_forward_branch_item (parser_context_t *context_p, uint16_t opcode,
                                                            parser_branch_node_t *next_p);
 void parser_emit_cbc_backward_branch (parser_context_t *context_p, uint16_t opcode, uint32_t offset);
+ecma_string_t *parser_new_ecma_string_from_literal (lexer_literal_t *literal_p);
 void parser_set_branch_to_current_position (parser_context_t *context_p, parser_branch_t *branch_p);
 void parser_set_breaks_to_current_position (parser_context_t *context_p, parser_branch_node_t *current_p);
 void parser_set_continues_to_current_position (parser_context_t *context_p, parser_branch_node_t *current_p);