From 81aa8b7135c5806c06d917cd0ae627089297ff12 Mon Sep 17 00:00:00 2001 From: sbulen Date: Mon, 7 Dec 2020 22:43:26 -0800 Subject: [PATCH 01/26] Installer & upgrader to utf8mb4 Signed by Shawn Bulen, bulens@pacbell.net --- other/Settings.php | 4 ++-- other/Settings_bak.php | 4 ++-- other/install.php | 4 ++-- other/install_2-1_mysql.sql | 6 +++--- other/upgrade.php | 13 +++++++------ other/upgrade_2-1_mysql.sql | 37 +++++++++++++++++++++++++++++++++++-- 6 files changed, 51 insertions(+), 17 deletions(-) diff --git a/other/Settings.php b/other/Settings.php index 08d6c64248..362e2ac87f 100644 --- a/other/Settings.php +++ b/other/Settings.php @@ -146,7 +146,7 @@ * * @var null|bool */ -$db_mb4 = null; +$db_mb4 = true; ########## Cache Info ########## /** @@ -240,7 +240,7 @@ ######### Legacy Settings ######### # UTF-8 is now the only character set supported in 2.1. -$db_character_set = 'utf8'; +$db_character_set = 'utf8mb4'; ########## Error-Catching ########## # Note: You shouldn't touch these settings. diff --git a/other/Settings_bak.php b/other/Settings_bak.php index 6457a46346..54f47a1f4a 100644 --- a/other/Settings_bak.php +++ b/other/Settings_bak.php @@ -146,7 +146,7 @@ * * @var null|bool */ -$db_mb4 = null; +$db_mb4 = true; ########## Cache Info ########## /** @@ -239,7 +239,7 @@ ######### Legacy Settings ######### # UTF-8 is now the only character set supported in 2.1. -$db_character_set = 'utf8'; +$db_character_set = 'utf8mb4'; ########## Error-Catching ########## # Note: You shouldn't touch these settings. diff --git a/other/install.php b/other/install.php index c27275d35c..5fdf1ca86a 100644 --- a/other/install.php +++ b/other/install.php @@ -1211,8 +1211,8 @@ function DatabasePopulation() $replaces['{$memory}'] = (!$has_innodb && in_array('MEMORY', $engines)) ? 'MEMORY' : $replaces['{$engine}']; // UTF-8 is required. - $replaces['{$engine}'] .= ' DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci'; - $replaces['{$memory}'] .= ' DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci'; + $replaces['{$engine}'] .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; + $replaces['{$memory}'] .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; // One last thing - if we don't have InnoDB, we can't do transactions... if (!$has_innodb) diff --git a/other/install_2-1_mysql.sql b/other/install_2-1_mysql.sql index 9f4942bab4..dc6ddfda89 100644 --- a/other/install_2-1_mysql.sql +++ b/other/install_2-1_mysql.sql @@ -740,8 +740,8 @@ CREATE TABLE {$db_prefix}members ( tfa_backup VARCHAR(64) NOT NULL DEFAULT '', PRIMARY KEY (id_member), INDEX idx_member_name (member_name), - INDEX idx_real_name (real_name), - INDEX idx_email_address (email_address), + INDEX idx_real_name (real_name(191)), + INDEX idx_email_address (email_address(191)), INDEX idx_date_registered (date_registered), INDEX idx_id_group (id_group), INDEX idx_birthdate (birthdate), @@ -985,7 +985,7 @@ CREATE TABLE {$db_prefix}qanda ( question VARCHAR(255) NOT NULL DEFAULT '', answers TEXT NOT NULL, PRIMARY KEY (id_question), - INDEX idx_lngfile (lngfile) + INDEX idx_lngfile (lngfile(191)) ) ENGINE={$engine}; # diff --git a/other/upgrade.php b/other/upgrade.php index b873620b16..51fa4c011c 100644 --- a/other/upgrade.php +++ b/other/upgrade.php @@ -2043,7 +2043,7 @@ function($errno, $errstr, $errfile, $errline) use ($support_js) // If we're on MySQL, set {db_collation}; this approach is used throughout upgrade_2-0_mysql.php to set new tables to utf8 // Note it is expected to be in the format: ENGINE=MyISAM{$db_collation}; if ($db_type == 'mysql') - $db_collation = ' DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci'; + $db_collation = ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; else $db_collation = ''; @@ -2057,7 +2057,7 @@ function($errno, $errstr, $errfile, $errline) use ($support_js) $last_step = ''; // Make sure all newly created tables will have the proper characters set; this approach is used throughout upgrade_2-1_mysql.php - $lines = str_replace(') ENGINE=MyISAM;', ') ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci;', $lines); + $lines = str_replace(') ENGINE=MyISAM;', ') ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;', $lines); // Count the total number of steps within this file - for progress. $file_steps = substr_count(implode('', $lines), '---#'); @@ -3175,8 +3175,8 @@ function ConvertUtf8() { list($charset) = explode('_', $collation); - // Build structure of columns to operate on organized by charset; only operate on columns not yet utf8 - if ($charset != 'utf8') + // Build structure of columns to operate on organized by charset; only operate on columns not yet utf8mb4 + if ($charset != 'utf8mb4') { if (!isset($table_charsets[$charset])) $table_charsets[$charset] = array(); @@ -3188,7 +3188,7 @@ function ConvertUtf8() } $smcFunc['db_free_result']($queryColumns); - // Only change the non-utf8 columns identified above + // Only change the non-utf8mb4 columns identified above if (count($table_charsets) > 0) { $updates_blob = ''; @@ -3253,7 +3253,7 @@ function ConvertUtf8() $smcFunc['db_query']('', ' ALTER TABLE {raw:table_name} - CONVERT TO CHARACTER SET utf8', + CONVERT TO CHARACTER SET utf8mb4', array( 'table_name' => $table_info['Name'], ) @@ -3281,6 +3281,7 @@ function ConvertUtf8() require_once($sourcedir . '/Subs.php'); require_once($sourcedir . '/Subs-Admin.php'); updateSettingsFile(array('db_character_set' => 'utf8')); + updateSettingsFile(array('db_mb4' => true)); // The conversion might have messed up some serialized strings. Fix them! $request = $smcFunc['db_query']('', ' diff --git a/other/upgrade_2-1_mysql.sql b/other/upgrade_2-1_mysql.sql index bf84c9cc38..44b4f4a727 100644 --- a/other/upgrade_2-1_mysql.sql +++ b/other/upgrade_2-1_mysql.sql @@ -1817,7 +1817,7 @@ CREATE TABLE IF NOT EXISTS {$db_prefix}qanda ( question VARCHAR(255) NOT NULL DEFAULT '', answers TEXT NOT NULL, PRIMARY KEY (id_question), - INDEX idx_lngfile (lngfile) + INDEX idx_lngfile (lngfile(191)) ) ENGINE=MyISAM; ---# @@ -3813,4 +3813,37 @@ foreach($files AS $filename) unset($_GET['last_action_id']); unset($_GET['total_fixes']); ---} ----# \ No newline at end of file +---# + +/******************************************************************************/ +--- Prepare indexes for mb4 +/******************************************************************************/ +---# real_name column drop +ALTER TABLE {$db_prefix}members +DROP INDEX idx_real_name; +---# + +---# real_name column recreate +ALTER TABLE {$db_prefix}members +ADD INDEX idx_real_name (real_name(191)); +---# + +---# email column drop +ALTER TABLE {$db_prefix}members +DROP INDEX idx_email_address; +---# + +---# email column recreate +ALTER TABLE {$db_prefix}members +ADD INDEX idx_email_address (email_address(191)); +---# + +---# lngfile column drop +ALTER TABLE {$db_prefix}qanda +DROP INDEX idx_lngfile; +---# + +---# lngfile column recreate +ALTER TABLE {$db_prefix}qanda +ADD INDEX idx_lngfile (lngfile(191)); +---# From 2a3c0efc7d2e8b6d8d98d4d6afbb0307fbb8f3c4 Mon Sep 17 00:00:00 2001 From: sbulen Date: Tue, 8 Dec 2020 16:56:40 -0800 Subject: [PATCH 02/26] UTF8MB4 maint functions Signed by Shawn Bulen, bulens@pacbell.net --- Sources/ManageMaintenance.php | 188 +++++++++++++++++- Themes/default/ManageMaintenance.template.php | 19 +- .../languages/ManageMaintenance.english.php | 4 + 3 files changed, 209 insertions(+), 2 deletions(-) diff --git a/Sources/ManageMaintenance.php b/Sources/ManageMaintenance.php index 74ceb5133f..e5d2b24376 100644 --- a/Sources/ManageMaintenance.php +++ b/Sources/ManageMaintenance.php @@ -63,6 +63,7 @@ function ManageMaintenance() 'activities' => array( 'optimize' => 'OptimizeTables', 'convertentities' => 'ConvertEntities', + 'convertutf8mb4' => 'ConvertUtf8mb4', 'convertmsgbody' => 'ConvertMsgBody', ), ), @@ -126,9 +127,10 @@ function ManageMaintenance() */ function MaintainDatabase() { - global $context, $db_type, $db_character_set, $modSettings, $smcFunc, $txt; + global $context, $db_type, $db_character_set, $db_mb4, $modSettings, $smcFunc, $txt; // Show some conversion options? + $context['convert_utf8mb4'] = $db_type == 'mysql' && $db_mb4 == false; $context['convert_entities'] = isset($modSettings['global_character_set']) && $modSettings['global_character_set'] === 'UTF-8'; if ($db_type == 'mysql') @@ -146,6 +148,8 @@ function MaintainDatabase() if (isset($_GET['done']) && $_GET['done'] == 'convertentities') $context['maintenance_finished'] = $txt['entity_convert_title']; + elseif (isset($_GET['done']) && $_GET['done'] == 'convertutf8mb4') + $context['maintenance_finished'] = $txt['utf8_title']; } /** @@ -469,6 +473,188 @@ function ConvertMsgBody() } } +/** + * Converts all text columns from utf8_general_ci to utf8mb4_general_ci. + * Assumption: This forum has undergone a UTF8 conversion. + * + * This action is linked from the maintenance screen (if applicable). + * It is accessed by ?action=admin;area=maintain;sa=database;activity=convertutf8mb4. + * + * @uses template_convert_utf8mb4() + */ +function ConvertUtf8mb4() +{ + global $scripturl, $context, $txt, $language, $db_character_set, $db_name; + global $modSettings, $user_info, $sourcedir, $smcFunc, $db_prefix; + + // Show me your badge! + isAllowedTo('admin_forum'); + + // Confirm utf8mb4 is supported + $request = $smcFunc['db_query']('', ' + SHOW CHARACTER SET', + array( + ) + ); + + $db_charsets = array(); + while ($row = $smcFunc['db_fetch_assoc']($request)) + $db_charsets[] = $row['Charset']; + $smcFunc['db_free_result']($request); + + if (!in_array('utf8mb4', $db_charsets)) + fatal_lang_error('utf8_charset_not_supported'); + + // Identify all tables + // Note we do all tables in order to set default collation at table level + $request = $smcFunc['db_query']('', ' + SELECT DISTINCT TABLE_NAME + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = {string:cur_schema} + AND TABLE_NAME LIKE {string:table_pattern}', + array( + 'cur_schema' => $db_name, + 'table_pattern' => $db_prefix . '%', + ) + ); + + $db_tables = array(); + while ($row = $smcFunc['db_fetch_assoc']($request)) + $db_tables[] = $row['TABLE_NAME']; + $smcFunc['db_free_result']($request); + + // Check each of the three indexes that may need updating - #1 + $request = $smcFunc['db_query']('', ' + SELECT SUB_PART + FROM information_schema.STATISTICS + WHERE TABLE_NAME = {string:cur_table} + AND TABLE_SCHEMA = {string:cur_schema} + AND INDEX_NAME = \'idx_real_name\' + AND COLUMN_NAME = \'real_name\'', + array( + 'cur_schema' => $db_name, + 'cur_table' => $db_prefix . 'members', + ) + ); + + list($fix_real_name) = $smcFunc['db_fetch_row']($request); + if ($fix_real_name == null) + $fix_real_name = true; + else + $fix_real_name = false; + + $smcFunc['db_free_result']($request); + + // Check each of the three indexes that may need updating - #2 + $request = $smcFunc['db_query']('', ' + SELECT SUB_PART + FROM information_schema.STATISTICS + WHERE TABLE_NAME = {string:cur_table} + AND TABLE_SCHEMA = {string:cur_schema} + AND INDEX_NAME = \'idx_email_address\' + AND COLUMN_NAME = \'email_address\'', + array( + 'cur_schema' => $db_name, + 'cur_table' => $db_prefix . 'members', + ) + ); + + list($fix_email_address) = $smcFunc['db_fetch_row']($request); + if ($fix_email_address == null) + $fix_email_address = true; + else + $fix_email_address = false; + + $smcFunc['db_free_result']($request); + + // Check each of the three indexes that may need updating - #3 + $request = $smcFunc['db_query']('', ' + SELECT SUB_PART + FROM information_schema.STATISTICS + WHERE TABLE_NAME = {string:cur_table} + AND TABLE_SCHEMA = {string:cur_schema} + AND INDEX_NAME = \'idx_lngfile\' + AND COLUMN_NAME = \'lngfile\'', + array( + 'cur_schema' => $db_name, + 'cur_table' => $db_prefix . 'qanda', + ) + ); + + list($fix_lngfile) = $smcFunc['db_fetch_row']($request); + if ($fix_lngfile == null) + $fix_lngfile = true; + else + $fix_lngfile = false; + + $smcFunc['db_free_result']($request); + + // After this point we are starting the conversion. But first: session check. + checkSession(); + + // First, drop the three indexes if they need fixing... + if ($fix_real_name) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + DROP INDEX idx_real_name', + array( + ) + ); + if ($fix_email_address) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + DROP INDEX idx_email_address', + array( + ) + ); + if ($fix_lngfile) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}qanda + DROP INDEX idx_lngfile', + array( + ) + ); + + // Next, loop thru & fix each table + foreach ($db_tables AS $cur_table) + { + $request = $smcFunc['db_query']('', ' + ALTER TABLE ' . $cur_table . ' CONVERT TO CHARACTER SET \'utf8mb4\' COLLATE \'utf8mb4_general_ci\'', + array( + ) + ); + } + + // Next, fix the three indexes... + if ($fix_real_name) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + ADD INDEX idx_real_name (real_name(191))', + array( + ) + ); + if ($fix_email_address) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + ADD INDEX idx_email_address (email_address(191))', + array( + ) + ); + if ($fix_lngfile) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}qanda + ADD INDEX idx_lngfile (lngfile(191))', + array( + ) + ); + + // Finally, note we are now mb4 in Settings.php + require_once($sourcedir . '/Subs-Admin.php'); + updateSettingsFile(array('db_character_set' => 'utf8mb4', 'db_mb4' => true)); + + redirectexit('action=admin;area=maintain;sa=database;done=convertutf8mb4'); +} + /** * Converts HTML-entities to their UTF-8 character equivalents. * This requires the admin_forum permission. diff --git a/Themes/default/ManageMaintenance.template.php b/Themes/default/ManageMaintenance.template.php index ffef2c8a76..c0885d2283 100644 --- a/Themes/default/ManageMaintenance.template.php +++ b/Themes/default/ManageMaintenance.template.php @@ -38,6 +38,23 @@ function template_maintain_database() '; + // Show an option to convert to UTF8MB4 if we're not on UTF8MB4 yet. + if ($context['convert_utf8mb4']) + { + echo ' +
+

', $txt['utf8_title'], '

+
+
+
+

', $txt['utf8_introduction'], '

+ + + +
+
'; + } + // Show an option to convert the body column of the post table to MEDIUMTEXT or TEXT if (isset($context['convert_to'])) echo ' @@ -576,7 +593,7 @@ function template_convert_entities() } /** - * Template for converting posts to UTF-8. + * Template for changing message body from TEXT to MEDIUMTEXT. */ function template_convert_msgbody() { diff --git a/Themes/default/languages/ManageMaintenance.english.php b/Themes/default/languages/ManageMaintenance.english.php index 1be7044b9a..fa8cd2f9d9 100644 --- a/Themes/default/languages/ManageMaintenance.english.php +++ b/Themes/default/languages/ManageMaintenance.english.php @@ -201,6 +201,10 @@ $txt['convert_to_text'] = 'No messages are longer than 65535 characters. You can safely proceed with the conversion without losing any part of the text.'; $txt['convert_to_suggest_text'] = 'The messages body column in your database is currently set as MEDIUMTEXT, but the maximum allowed length set for the messages is lower than 65535 characters. You may free some space converting the column to TEXT.'; +$txt['utf8_title'] = 'Convert the database and data to UTF8MB4'; +$txt['utf8_introduction'] = 'UTF8MB4 is an international character set covering nearly all languages around the world. Converting your database and data to UTF8MB4 can make it easier to support multiple languages on the same board. It also can enhance search and sorting capabilities for languages with non-latin characters.'; +$txt['utf8_charset_not_supported'] = 'Conversion to UTF8MB4 is not supported.'; + $txt['entity_convert_title'] = 'Convert HTML-entities to UTF-8 characters'; $txt['entity_convert_only_utf8'] = 'The database needs to be in UTF-8 format before HTML-entities can be converted to UTF-8'; $txt['entity_convert_introduction'] = 'This function will convert all characters that are stored in the database as HTML-entities to UTF-8 characters. This is especially useful when you have just converted your forum from a character set like ISO-8859-1 while non-latin characters were used on the forum. The browser then sends all characters as HTML-entities. For example, the HTML-entity &#945; represents the greek letter α (alpha). Converting entities to UTF-8 will improve searching and sorting of text and reduce storage size.'; From eef61859e777783e43a0f90f47419e779878c1f6 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 17:12:48 -0800 Subject: [PATCH 03/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Load.php | 55 ++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/Sources/Load.php b/Sources/Load.php index 3d0eb4f261..392964608a 100644 --- a/Sources/Load.php +++ b/Sources/Load.php @@ -100,10 +100,6 @@ function reloadSettings() if (empty($modSettings['force_ssl'])) $image_proxy_enabled = false; - // UTF-8 ? - $utf8 = (empty($modSettings['global_character_set']) ? $txt['lang_character_set'] : $modSettings['global_character_set']) === 'UTF-8'; - $context['utf8'] = $utf8; - // Set a list of common functions. $ent_list = '&(?:#' . (empty($modSettings['disableEntityCheck']) ? '\d{1,7}' : '021') . '|quot|amp|lt|gt|nbsp);'; $ent_check = empty($modSettings['disableEntityCheck']) ? function($string) @@ -114,9 +110,9 @@ function reloadSettings() { return (string) $string; }; - $fix_utf8mb4 = function($string) use ($utf8, $smcFunc) + $fix_utf8mb4 = function($string) use ($smcFunc) { - if (!$utf8 || $smcFunc['db_mb4']) + if ($smcFunc['db_mb4']) return $string; $i = 0; @@ -161,26 +157,26 @@ function reloadSettings() $num = $string[0] === 'x' ? hexdec(substr($string, 1)) : (int) $string; return $num < 0x20 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF) || $num === 0x202E || $num === 0x202D ? '' : '&#' . $num . ';'; }, - 'htmlspecialchars' => function($string, $quote_style = ENT_COMPAT, $charset = 'ISO-8859-1') use ($ent_check, $utf8, $fix_utf8mb4, &$smcFunc) + 'htmlspecialchars' => function($string, $quote_style = ENT_COMPAT, $charset = 'UTF-8') use ($ent_check, $fix_utf8mb4, &$smcFunc) { $string = $smcFunc['normalize']($string); - return $fix_utf8mb4($ent_check(htmlspecialchars($string, $quote_style, $utf8 ? 'UTF-8' : $charset))); + return $fix_utf8mb4($ent_check(htmlspecialchars($string, $quote_style, $charset))); }, - 'htmltrim' => function($string) use ($utf8, $ent_check) + 'htmltrim' => function($string) use ($ent_check) { // Preg_replace space characters depend on the character set in use - $space_chars = $utf8 ? '\p{Z}\p{C}' : '\x00-\x20\x80-\xA0'; + $space_chars = '\p{Z}\p{C}'; - return preg_replace('~^(?:[' . $space_chars . ']| )+|(?:[' . $space_chars . ']| )+$~' . ($utf8 ? 'u' : ''), '', $ent_check($string)); + return preg_replace('~^(?:[' . $space_chars . ']| )+|(?:[' . $space_chars . ']| )+$~u', '', $ent_check($string)); }, - 'strlen' => function($string) use ($ent_list, $utf8, $ent_check) + 'strlen' => function($string) use ($ent_list, $ent_check) { - return strlen(preg_replace('~' . $ent_list . ($utf8 ? '|.~u' : '~'), '_', $ent_check($string))); + return strlen(preg_replace('~' . $ent_list . '|.~u', '_', $ent_check($string))); }, - 'strpos' => function($haystack, $needle, $offset = 0) use ($utf8, $ent_check, $ent_list, $modSettings) + 'strpos' => function($haystack, $needle, $offset = 0) use ($ent_check, $ent_list, $modSettings) { - $haystack_arr = preg_split('~(' . $ent_list . '|.)~' . ($utf8 ? 'u' : ''), $ent_check($haystack), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); + $haystack_arr = preg_split('~(' . $ent_list . '|.)~u', $ent_check($haystack), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); if (strlen($needle) === 1) { @@ -189,7 +185,7 @@ function reloadSettings() } else { - $needle_arr = preg_split('~(' . $ent_list . '|.)~' . ($utf8 ? 'u' : '') . '', $ent_check($needle), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); + $needle_arr = preg_split('~(' . $ent_list . '|.)~u' . '', $ent_check($needle), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); $needle_size = count($needle_arr); $result = array_search($needle_arr[0], array_slice($haystack_arr, $offset)); @@ -203,12 +199,12 @@ function reloadSettings() return false; } }, - 'substr' => function($string, $start, $length = null) use ($utf8, $ent_check, $ent_list, $modSettings) + 'substr' => function($string, $start, $length = null) use ($ent_check, $ent_list, $modSettings) { - $ent_arr = preg_split('~(' . $ent_list . '|.)~' . ($utf8 ? 'u' : '') . '', $ent_check($string), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); + $ent_arr = preg_split('~(' . $ent_list . '|.)~u' . '', $ent_check($string), -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); return $length === null ? implode('', array_slice($ent_arr, $start)) : implode('', array_slice($ent_arr, $start, $length)); }, - 'strtolower' => $utf8 ? function($string) use ($sourcedir, &$smcFunc) + 'strtolower' => function($string) use ($sourcedir, &$smcFunc) { $string = $smcFunc['normalize']($string); @@ -219,8 +215,8 @@ function reloadSettings() } return mb_strtolower($string, 'UTF-8'); - } : 'strtolower', - 'strtoupper' => $utf8 ? function($string) use ($sourcedir, &$smcFunc) + }, + 'strtoupper' => function($string) use ($sourcedir, &$smcFunc) { $string = $smcFunc['normalize']($string); @@ -231,27 +227,27 @@ function reloadSettings() } return mb_strtoupper($string, 'UTF-8'); - } : 'strtoupper', - 'truncate' => function($string, $length) use ($utf8, $ent_check, $ent_list, &$smcFunc) + }, + 'truncate' => function($string, $length) use ($ent_check, $ent_list, &$smcFunc) { $string = $ent_check($string); - preg_match('~^(' . $ent_list . '|.){' . $smcFunc['strlen'](substr($string, 0, $length)) . '}~' . ($utf8 ? 'u' : ''), $string, $matches); + preg_match('~^(' . $ent_list . '|.){' . $smcFunc['strlen'](substr($string, 0, $length)) . '}~u', $string, $matches); $string = $matches[0]; while (strlen($string) > $length) - $string = preg_replace('~(?:' . $ent_list . '|.)$~' . ($utf8 ? 'u' : ''), '', $string); + $string = preg_replace('~(?:' . $ent_list . '|.)$~u', '', $string); return $string; }, - 'ucfirst' => $utf8 ? function($string) use (&$smcFunc) + 'ucfirst' => function($string) use (&$smcFunc) { return $smcFunc['strtoupper']($smcFunc['substr']($string, 0, 1)) . $smcFunc['substr']($string, 1); - } : 'ucfirst', - 'ucwords' => $utf8 ? function($string) use (&$smcFunc) + }, + 'ucwords' => function($string) use (&$smcFunc) { $words = preg_split('~([\s\r\n\t]+)~', $string, -1, PREG_SPLIT_DELIM_CAPTURE); for ($i = 0, $n = count($words); $i < $n; $i += 2) $words[$i] = $smcFunc['ucfirst']($words[$i]); return implode('', $words); - } : 'ucwords', + }, 'json_decode' => 'smf_json_decode', 'json_encode' => 'json_encode', 'random_int' => function($min = 0, $max = PHP_INT_MAX) @@ -3259,7 +3255,6 @@ function getBoardParents($id_parent) /** * Attempt to reload our known languages. - * It will try to choose only utf8 or non-utf8 languages. * * @param bool $use_cache Whether or not to use the cache * @return array An array of information about available languages From 4ffa40d6e5138ac68d2cafb8c55ad4495833a351 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 17:17:45 -0800 Subject: [PATCH 04/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/DbPackages-mysql.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Sources/DbPackages-mysql.php b/Sources/DbPackages-mysql.php index d4e2a5542f..67151a6861 100644 --- a/Sources/DbPackages-mysql.php +++ b/Sources/DbPackages-mysql.php @@ -199,6 +199,8 @@ function smf_db_create_table($table_name, $columns, $indexes = array(), $paramet $table_query .= ') ENGINE=' . $parameters['engine']; if (!empty($db_character_set) && $db_character_set == 'utf8') $table_query .= ' DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci'; + else + $table_query .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; // Create the table! $smcFunc['db_query']('', $table_query, From ddec44278ed528af49202e406320f6316c9a598c Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 17:20:58 -0800 Subject: [PATCH 05/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Profile-Export.php | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Sources/Profile-Export.php b/Sources/Profile-Export.php index 34c9b0428b..67e21b077c 100644 --- a/Sources/Profile-Export.php +++ b/Sources/Profile-Export.php @@ -534,21 +534,18 @@ function ($datatype) use ($txt) header('etag: ' . $eTag); header('content-type: ' . $export_formats[$_GET['format']]['mime']); - // Convert the file to UTF-8, cuz most browsers dig that. - $utf8name = !$context['utf8'] && function_exists('iconv') ? iconv($context['character_set'], 'UTF-8', $dlbasename) : (!$context['utf8'] && function_exists('mb_convert_encoding') ? mb_convert_encoding($dlbasename, 'UTF-8', $context['character_set']) : $dlbasename); - // Different browsers like different standards... if (isBrowser('firefox')) - header('content-disposition: attachment; filename*=UTF-8\'\'' . rawurlencode(preg_replace_callback('~&#(\d{3,8});~', 'fixchar__callback', $utf8name))); + header('content-disposition: attachment; filename*=UTF-8\'\'' . rawurlencode(preg_replace_callback('~&#(\d{3,8});~', 'fixchar__callback', $dlbasename))); elseif (isBrowser('opera')) - header('content-disposition: attachment; filename="' . preg_replace_callback('~&#(\d{3,8});~', 'fixchar__callback', $utf8name) . '"'); + header('content-disposition: attachment; filename="' . preg_replace_callback('~&#(\d{3,8});~', 'fixchar__callback', $dlbasename) . '"'); elseif (isBrowser('ie')) - header('content-disposition: attachment; filename="' . urlencode(preg_replace_callback('~&#(\d{3,8});~', 'fixchar__callback', $utf8name)) . '"'); + header('content-disposition: attachment; filename="' . urlencode(preg_replace_callback('~&#(\d{3,8});~', 'fixchar__callback', $dlbasename)) . '"'); else - header('content-disposition: attachment; filename="' . $utf8name . '"'); + header('content-disposition: attachment; filename="' . $dlbasename . '"'); header('cache-control: max-age=' . (525600 * 60) . ', private'); From d2bfbe8ca14522754edc841ac4b6828124b3e797 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 20:40:55 -0800 Subject: [PATCH 06/26] All utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Subs-Post.php | 56 ++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/Sources/Subs-Post.php b/Sources/Subs-Post.php index 6896f73094..cdb6f7bd67 100644 --- a/Sources/Subs-Post.php +++ b/Sources/Subs-Post.php @@ -45,13 +45,8 @@ function preparsecode(&$message, $previewing = false) ); $message = strtr($message, $control_replacements); - // This line makes all languages *theoretically* work even with the wrong charset ;). - if (empty($context['utf8'])) - $message = preg_replace('~&#(\d{4,5}|[2-9]\d{2,4}|1[2-9]\d);~', '&#$1;', $message); - // Normalize Unicode characters for storage efficiency, better searching, etc. - else - $message = $smcFunc['normalize']($message); + $message = $smcFunc['normalize']($message); // Clean out any other funky stuff. $message = sanitize_chars($message, 0); @@ -131,7 +126,7 @@ function($a) $message = implode('', $parts); // The regular expression non breaking space has many versions. - $non_breaking_space = $context['utf8'] ? '\x{A0}' : '\xA0'; + $non_breaking_space = '\x{A0}'; // Now that we've fixed all the code tags, let's fix the img and url tags... fixTags($message); @@ -216,25 +211,25 @@ function($m) $mistake_fixes = array( // Find [table]s not followed by [tr]. - '~\[table\](?![\s' . $non_breaking_space . ']*\[tr\])~s' . ($context['utf8'] ? 'u' : '') => '[table][tr]', + '~\[table\](?![\s' . $non_breaking_space . ']*\[tr\])~su' => '[table][tr]', // Find [tr]s not followed by [td]. - '~\[tr\](?![\s' . $non_breaking_space . ']*\[td\])~s' . ($context['utf8'] ? 'u' : '') => '[tr][td]', + '~\[tr\](?![\s' . $non_breaking_space . ']*\[td\])~su' => '[tr][td]', // Find [/td]s not followed by something valid. - '~\[/td\](?![\s' . $non_breaking_space . ']*(?:\[td\]|\[/tr\]|\[/table\]))~s' . ($context['utf8'] ? 'u' : '') => '[/td][/tr]', + '~\[/td\](?![\s' . $non_breaking_space . ']*(?:\[td\]|\[/tr\]|\[/table\]))~su' => '[/td][/tr]', // Find [/tr]s not followed by something valid. - '~\[/tr\](?![\s' . $non_breaking_space . ']*(?:\[tr\]|\[/table\]))~s' . ($context['utf8'] ? 'u' : '') => '[/tr][/table]', + '~\[/tr\](?![\s' . $non_breaking_space . ']*(?:\[tr\]|\[/table\]))~su' => '[/tr][/table]', // Find [/td]s incorrectly followed by [/table]. - '~\[/td\][\s' . $non_breaking_space . ']*\[/table\]~s' . ($context['utf8'] ? 'u' : '') => '[/td][/tr][/table]', + '~\[/td\][\s' . $non_breaking_space . ']*\[/table\]~su' => '[/td][/tr][/table]', // Find [table]s, [tr]s, and [/td]s (possibly correctly) followed by [td]. - '~\[(table|tr|/td)\]([\s' . $non_breaking_space . ']*)\[td\]~s' . ($context['utf8'] ? 'u' : '') => '[$1]$2[_td_]', + '~\[(table|tr|/td)\]([\s' . $non_breaking_space . ']*)\[td\]~su' => '[$1]$2[_td_]', // Now, any [td]s left should have a [tr] before them. '~\[td\]~s' => '[tr][td]', // Look for [tr]s which are correctly placed. - '~\[(table|/tr)\]([\s' . $non_breaking_space . ']*)\[tr\]~s' . ($context['utf8'] ? 'u' : '') => '[$1]$2[_tr_]', + '~\[(table|/tr)\]([\s' . $non_breaking_space . ']*)\[tr\]~su' => '[$1]$2[_tr_]', // Any remaining [tr]s should have a [table] before them. '~\[tr\]~s' => '[table][tr]', // Look for [/td]s followed by [/tr]. - '~\[/td\]([\s' . $non_breaking_space . ']*)\[/tr\]~s' . ($context['utf8'] ? 'u' : '') => '[/td]$1[_/tr_]', + '~\[/td\]([\s' . $non_breaking_space . ']*)\[/tr\]~su' => '[/td]$1[_/tr_]', // Any remaining [/tr]s should have a [/td]. '~\[/tr\]~s' => '[/td][/tr]', // Look for properly opened [li]s which aren't closed. @@ -242,14 +237,14 @@ function($m) '~\[li\]([^\[\]]+?)\[/list\]~s' => '[_li_]$1[_/li_][/list]', '~\[li\]([^\[\]]+?)$~s' => '[li]$1[/li]', // Lists - find correctly closed items/lists. - '~\[/li\]([\s' . $non_breaking_space . ']*)\[/list\]~s' . ($context['utf8'] ? 'u' : '') => '[_/li_]$1[/list]', + '~\[/li\]([\s' . $non_breaking_space . ']*)\[/list\]~su' => '[_/li_]$1[/list]', // Find list items closed and then opened. - '~\[/li\]([\s' . $non_breaking_space . ']*)\[li\]~s' . ($context['utf8'] ? 'u' : '') => '[_/li_]$1[_li_]', + '~\[/li\]([\s' . $non_breaking_space . ']*)\[li\]~su' => '[_/li_]$1[_li_]', // Now, find any [list]s or [/li]s followed by [li]. - '~\[(list(?: [^\]]*?)?|/li)\]([\s' . $non_breaking_space . ']*)\[li\]~s' . ($context['utf8'] ? 'u' : '') => '[$1]$2[_li_]', + '~\[(list(?: [^\]]*?)?|/li)\]([\s' . $non_breaking_space . ']*)\[li\]~su' => '[$1]$2[_li_]', // Allow for sub lists. - '~\[/li\]([\s' . $non_breaking_space . ']*)\[list\]~' . ($context['utf8'] ? 'u' : '') => '[_/li_]$1[list]', - '~\[/list\]([\s' . $non_breaking_space . ']*)\[li\]~' . ($context['utf8'] ? 'u' : '') => '[/list]$1[_li_]', + '~\[/li\]([\s' . $non_breaking_space . ']*)\[list\]~u' => '[_/li_]$1[list]', + '~\[/list\]([\s' . $non_breaking_space . ']*)\[li\]~u' => '[/list]$1[_li_]', // Any remaining [li]s weren't inside a [list]. '~\[li\]~' => '[list][li]', // Any remaining [/li]s weren't before a [/list]. @@ -287,9 +282,9 @@ function($m) // Restore white space entities if (!$previewing) - $message = strtr($message, array(' ' => '  ', "\n" => '
', $context['utf8'] ? "\xC2\xA0" : "\xA0" => ' ')); + $message = strtr($message, array(' ' => '  ', "\n" => '
', "\xC2\xA0" => ' ')); else - $message = strtr($message, array(' ' => '  ', $context['utf8'] ? "\xC2\xA0" : "\xA0" => ' ')); + $message = strtr($message, array(' ' => '  ', "\xC2\xA0" => ' ')); // Now let's quickly clean up things that will slow our parser (which are common in posted code.) $message = strtr($message, array('[]' => '[]', '['' => '['')); @@ -1308,14 +1303,6 @@ function($m) ); else { - // Try to convert the string to UTF-8. - if (!$context['utf8'] && function_exists('iconv')) - { - $newstring = @iconv($context['character_set'], 'UTF-8', $string); - if ($newstring) - $string = $newstring; - } - $string = preg_replace_callback('~&#(\d{3,8});~', 'fixchar__callback', $string); // Unicode, baby. @@ -1324,15 +1311,8 @@ function($m) } // Convert all special characters to HTML entities...just for Hotmail :-\ - if ($hotmail_fix && ($context['utf8'] || function_exists('iconv') || $context['character_set'] === 'ISO-8859-1')) + if ($hotmail_fix && function_exists('iconv')) { - if (!$context['utf8'] && function_exists('iconv')) - { - $newstring = @iconv($context['character_set'], 'UTF-8', $string); - if ($newstring) - $string = $newstring; - } - $entityConvert = function($m) { $c = $m[1]; From a95298f0b0c60147607b79f1fc45dcfc5782c077 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 21:00:33 -0800 Subject: [PATCH 07/26] All utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Subs.php | 79 +++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 55 deletions(-) diff --git a/Sources/Subs.php b/Sources/Subs.php index 7992b4800f..9f48f83770 100644 --- a/Sources/Subs.php +++ b/Sources/Subs.php @@ -1520,10 +1520,6 @@ function parse_bbc($message, $smileys = true, $cache_id = '', $parse_tags = arra if ($message === '') return ''; - // Just in case it wasn't determined yet whether UTF-8 is enabled. - if (!isset($context['utf8'])) - $context['utf8'] = (empty($modSettings['global_character_set']) ? $txt['lang_character_set'] : $modSettings['global_character_set']) === 'UTF-8'; - // Clean up any cut/paste issues we may have $message = sanitizeMSCutPaste($message); @@ -3900,7 +3896,7 @@ function parsesmileys(&$message) list ($smileysfrom, $smileysto, $smileysdescs) = $temp; // The non-breaking-space is a complex thing... - $non_breaking_space = $context['utf8'] ? '\x{A0}' : '\xA0'; + $non_breaking_space = '\x{A0}'; // This smiley regex makes sure it doesn't parse smileys within code tags (so [url=mailto:David@bla.com] doesn't parse the :D smiley) $smileyPregReplacements = array(); @@ -3930,7 +3926,7 @@ function parsesmileys(&$message) } } - $smileyPregSearch = '~(?<=[>:\?\.\s' . $non_breaking_space . '[\]()*\\\;]|(?:\?\.\s' . $non_breaking_space . '[\]()*\\\;]|(?!@$%^*.,:+=`\~\?/\\\\]+|&(?:amp|lt|gt|quot);)+~' . ($context['utf8'] ? 'u' : ''), ' ', strtr($text, array('
' => ' '))); + $words = preg_replace('~(?:[\x0B\0' . '\x{A0}' . '\t\r\s\n(){}\\[\\]<>!@$%^*.,:+=`\~\?/\\\\]+|&(?:amp|lt|gt|quot);)+~u', ' ', strtr($text, array('
' => ' '))); // Step 2: Entities we left to letters, where applicable, lowercase. $words = un_htmlspecialchars($smcFunc['strtolower']($words)); @@ -6406,17 +6404,6 @@ function sanitizeMSCutPaste($string) "\xe2\x80\x9d", // right double curly quote ); - // windows 1252 / iso equivalents - $findchars_iso = array( - chr(130), - chr(132), - chr(133), - chr(145), - chr(146), - chr(147), - chr(148), - ); - // safe replacements $replacechars = array( ',', // ‚ @@ -6428,10 +6415,7 @@ function sanitizeMSCutPaste($string) '"', // ” ); - if ($context['utf8']) - $string = str_replace($findchars_utf8, $replacechars, $string); - else - $string = str_replace($findchars_iso, $replacechars, $string); + $string = str_replace($findchars_utf8, $replacechars, $string); return $string; } @@ -6463,37 +6447,22 @@ function replaceEntities__callback($matches) if (in_array($num, array(0x22, 0x26, 0x27, 0x3C, 0x3E))) return '&#' . $num . ';'; - if (empty($context['utf8'])) - { - // no control characters - if ($num < 0x20) - return ''; - // text is text - elseif ($num < 0x80) - return chr($num); - // all others get html-ised - else - return '&#' . $matches[2] . ';'; - } + // <0x20 are control characters, 0x20 is a space, > 0x10FFFF is past the end of the utf8 character set + // 0xD800 >= $num <= 0xDFFF are surrogate markers (not valid for utf8 text) + if ($num < 0x20 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) + return ''; + // <0x80 (or less than 128) are standard ascii characters a-z A-Z 0-9 and punctuation + elseif ($num < 0x80) + return chr($num); + // <0x800 (2048) + elseif ($num < 0x800) + return chr(($num >> 6) + 192) . chr(($num & 63) + 128); + // < 0x10000 (65536) + elseif ($num < 0x10000) + return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128); + // <= 0x10FFFF (1114111) else - { - // <0x20 are control characters, 0x20 is a space, > 0x10FFFF is past the end of the utf8 character set - // 0xD800 >= $num <= 0xDFFF are surrogate markers (not valid for utf8 text) - if ($num < 0x20 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) - return ''; - // <0x80 (or less than 128) are standard ascii characters a-z A-Z 0-9 and punctuation - elseif ($num < 0x80) - return chr($num); - // <0x800 (2048) - elseif ($num < 0x800) - return chr(($num >> 6) + 192) . chr(($num & 63) + 128); - // < 0x10000 (65536) - elseif ($num < 0x10000) - return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128); - // <= 0x10FFFF (1114111) - else - return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128); - } + return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128); } /** @@ -8349,7 +8318,7 @@ function cleanXml($string) // The Unicode surrogate pair code points should never be present in our // strings to begin with, but if any snuck in, they need to be removed. - if (!empty($context['utf8']) && strpos($string, "\xED") !== false) + if (strpos($string, "\xED") !== false) $string = preg_replace('/\xED[\xA0-\xBF][\x80-\xBF]/', '', $string); return $string; From d7638d34aed6c442413107105be531773872bfa6 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 21:11:12 -0800 Subject: [PATCH 08/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Subs-Admin.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/Subs-Admin.php b/Sources/Subs-Admin.php index 30a25eb4c1..27ce858e75 100644 --- a/Sources/Subs-Admin.php +++ b/Sources/Subs-Admin.php @@ -1185,7 +1185,7 @@ function updateSettingsFile($config_vars, $keep_quotes = null, $rebuild = false) $var_pattern = count($var_pattern) > 1 ? '(?:' . (implode('|', $var_pattern)) . ')' : $var_pattern[0]; - $substitutions[$var]['search_pattern'] = '~(?<=^|\s)\h*\$' . preg_quote($var, '~') . '\s*=\s*' . $var_pattern . ';~' . (!empty($utf8) ? 'u' : ''); + $substitutions[$var]['search_pattern'] = '~(?<=^|\s)\h*\$' . preg_quote($var, '~') . '\s*=\s*' . $var_pattern . ';~u'; } // Next create the placeholder or replace_pattern. @@ -1249,7 +1249,7 @@ function updateSettingsFile($config_vars, $keep_quotes = null, $rebuild = false) $placeholder = md5($prefix . $var); - $substitutions[$var]['search_pattern'] = '~(?<=^|\s)\h*\$' . preg_quote($var, '~') . '\s*=\s*' . $var_pattern . ';~' . (!empty($utf8) ? 'u' : ''); + $substitutions[$var]['search_pattern'] = '~(?<=^|\s)\h*\$' . preg_quote($var, '~') . '\s*=\s*' . $var_pattern . ';~u'; $substitutions[$var]['placeholder'] = $placeholder; $substitutions[$var]['replacement'] = '$' . $var . ' = ' . smf_var_export($val, true) . ";"; } From de79aa8a8d548fde7cf76c03c5cb851b526890f0 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 21:17:47 -0800 Subject: [PATCH 09/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/ManageLanguages.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Sources/ManageLanguages.php b/Sources/ManageLanguages.php index d2297c2c2a..9a5a937b2f 100644 --- a/Sources/ManageLanguages.php +++ b/Sources/ManageLanguages.php @@ -1115,13 +1115,13 @@ function($val1, $val2) // Read in the file's contents and process it into entries. // Also, remove any lines for uneditable variables like $forum_copyright from the working data. $entries = array(); - foreach (preg_split('~^(?=\$(?:' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\])~m' . ($context['utf8'] ? 'u' : ''), preg_replace('~\s*\n(\$(?!(?:' . implode('|', $string_types) . '))[^\n]*)~', '', file_get_contents($current_file))) as $blob) + foreach (preg_split('~^(?=\$(?:' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\])~mu', preg_replace('~\s*\n(\$(?!(?:' . implode('|', $string_types) . '))[^\n]*)~', '', file_get_contents($current_file))) as $blob) { // Comment lines at the end of the blob can make terrible messes - $blob = preg_replace('~(\n[ \t]*//[^\n]*)*$~' . ($context['utf8'] ? 'u' : ''), '', $blob); + $blob = preg_replace('~(\n[ \t]*//[^\n]*)*$~u', '', $blob); // Extract the variable - if (preg_match('~^\$(' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\](?:\[\'?([^\n]+?)\'?\])?\s?=\s?(.+);([ \t]*(?://[^\n]*)?)$~ms' . ($context['utf8'] ? 'u' : ''), strtr($blob, array("\r" => '')), $matches)) + if (preg_match('~^\$(' . implode('|', $string_types) . ')\[\'([^\n]+?)\'\](?:\[\'?([^\n]+?)\'?\])?\s?=\s?(.+);([ \t]*(?://[^\n]*)?)$~msu', strtr($blob, array("\r" => '')), $matches)) { // If no valid subkey was found, we need it to be explicitly null $matches[3] = isset($matches[3]) && $matches[3] !== '' ? $matches[3] : null; @@ -1196,7 +1196,7 @@ function($val1, $val2) # Followed by a comma or the end of the string (?=,|$) - /x' . ($context['utf8'] ? 'u' : ''), $entryValue['entry'], $matches); + /xu', $entryValue['entry'], $matches); if (empty($m)) continue; @@ -1427,7 +1427,7 @@ function($val1, $val2) foreach ($final_saves as $save) { if (!empty($save['is_regex'])) - $file_contents = preg_replace('~' . $save['find'] . '~' . ($context['utf8'] ? 'u' : ''), $save['replace'], $file_contents); + $file_contents = preg_replace('~' . $save['find'] . '~u', $save['replace'], $file_contents); else $file_contents = str_replace($save['find'], $save['replace'], $file_contents); } From 89a913d13d185f2c3de3b0b5bc46cec377b472cc Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 21:19:42 -0800 Subject: [PATCH 10/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/ManageServer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/ManageServer.php b/Sources/ManageServer.php index 6d789a0414..c7d812a807 100644 --- a/Sources/ManageServer.php +++ b/Sources/ManageServer.php @@ -1387,7 +1387,7 @@ function saveSettings(&$config_vars) // Fix the darn stupid cookiename! (more may not be allowed, but these for sure!) if (isset($_POST['cookiename'])) - $_POST['cookiename'] = preg_replace('~[,;\s\.$]+~' . ($context['utf8'] ? 'u' : ''), '', $_POST['cookiename']); + $_POST['cookiename'] = preg_replace('~[,;\s\.$]+~u', '', $_POST['cookiename']); // Fix the forum's URL if necessary. if (isset($_POST['boardurl'])) From 0d64abdab7a9363f6020ff37c762a0e3970efd48 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 21:26:23 -0800 Subject: [PATCH 11/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Memberlist.php | 2 +- Sources/PersonalMessage.php | 4 ++-- Sources/Search.php | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Sources/Memberlist.php b/Sources/Memberlist.php index abdf366ab5..2dbda3ddcd 100644 --- a/Sources/Memberlist.php +++ b/Sources/Memberlist.php @@ -238,7 +238,7 @@ function MLAll() if (!is_numeric($_REQUEST['start'])) { - if (preg_match('~^[^\'\\\\/]~' . ($context['utf8'] ? 'u' : ''), $smcFunc['strtolower']($_REQUEST['start']), $match) === 0) + if (preg_match('~^[^\'\\\\/]~u', $smcFunc['strtolower']($_REQUEST['start']), $match) === 0) fatal_error('Hacker?', false); $_REQUEST['start'] = $match[0]; diff --git a/Sources/PersonalMessage.php b/Sources/PersonalMessage.php index c81fda78b7..018a1dc13f 100644 --- a/Sources/PersonalMessage.php +++ b/Sources/PersonalMessage.php @@ -1436,11 +1436,11 @@ function MessageSearch2() $context['search_errors']['invalid_search_string'] = true; // Extract phrase parts first (e.g. some words "this is a phrase" some more words.) - preg_match_all('~(?:^|\s)([-]?)"([^"]+)"(?:$|\s)~' . ($context['utf8'] ? 'u' : ''), $search_params['search'], $matches, PREG_PATTERN_ORDER); + preg_match_all('~(?:^|\s)([-]?)"([^"]+)"(?:$|\s)~u', $search_params['search'], $matches, PREG_PATTERN_ORDER); $searchArray = $matches[2]; // Remove the phrase parts and extract the words. - $tempSearch = explode(' ', preg_replace('~(?:^|\s)(?:[-]?)"(?:[^"]+)"(?:$|\s)~' . ($context['utf8'] ? 'u' : ''), ' ', $search_params['search'])); + $tempSearch = explode(' ', preg_replace('~(?:^|\s)(?:[-]?)"(?:[^"]+)"(?:$|\s)~u', ' ', $search_params['search'])); // A minus sign in front of a word excludes the word.... so... $excludedWords = array(); diff --git a/Sources/Search.php b/Sources/Search.php index e57e7e4ddb..4ed7950488 100644 --- a/Sources/Search.php +++ b/Sources/Search.php @@ -678,7 +678,7 @@ function PlushSearch2() } // Change non-word characters into spaces. - $stripped_query = preg_replace('~(?:[\x0B\0' . ($context['utf8'] ? '\x{A0}' : '\xA0') . '\t\r\s\n(){}\\[\\]<>!@$%^*.,:+=`\~\?/\\\\]+|&(?:amp|lt|gt|quot);)+~' . ($context['utf8'] ? 'u' : ''), ' ', $search_params['search']); + $stripped_query = preg_replace('~(?:[\x0B\0' . '\x{A0}' . '\t\r\s\n(){}\\[\\]<>!@$%^*.,:+=`\~\?/\\\\]+|&(?:amp|lt|gt|quot);)+~u', ' ', $search_params['search']); // Make the query lower case. It's gonna be case insensitive anyway. $stripped_query = un_htmlspecialchars($smcFunc['strtolower']($stripped_query)); @@ -695,7 +695,7 @@ function PlushSearch2() $phraseArray = $matches[2]; // Remove the phrase parts and extract the words. - $wordArray = preg_replace('~(?:^|\s)(?:[-]?)"(?:[^"]+)"(?:$|\s)~' . ($context['utf8'] ? 'u' : ''), ' ', $search_params['search']); + $wordArray = preg_replace('~(?:^|\s)(?:[-]?)"(?:[^"]+)"(?:$|\s)~u', ' ', $search_params['search']); $wordArray = explode(' ', $smcFunc['htmlspecialchars'](un_htmlspecialchars($wordArray), ENT_QUOTES)); @@ -2092,9 +2092,9 @@ function prepareSearchContext($reset = false) $message['body'] = un_htmlspecialchars(strtr($message['body'], array(' ' => ' ', '
' => "\n", '[' => '[', ']' => ']', ':' => ':', '@' => '@'))); if (empty($modSettings['search_method']) || $force_partial_word) - preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?|^)(' . $matchString . ')(.{0,' . $charLimit . '}[\s\W]|[^\s\W]{0,' . $charLimit . '})/is' . ($context['utf8'] ? 'u' : ''), $message['body'], $matches); + preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?|^)(' . $matchString . ')(.{0,' . $charLimit . '}[\s\W]|[^\s\W]{0,' . $charLimit . '})/isu', $message['body'], $matches); else - preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?[\s\W]|^)(' . $matchString . ')([\s\W].{0,' . $charLimit . '}[\s\W]|[\s\W][^\s\W]{0,' . $charLimit . '})/is' . ($context['utf8'] ? 'u' : ''), $message['body'], $matches); + preg_match_all('/([^\s\W]{' . $charLimit . '}[\s\W]|[\s\W].{0,' . $charLimit . '}?[\s\W]|^)(' . $matchString . ')([\s\W].{0,' . $charLimit . '}[\s\W]|[\s\W][^\s\W]{0,' . $charLimit . '})/isu', $message['body'], $matches); $message['body'] = ''; foreach ($matches[0] as $index => $match) From 8438e2bbc70fe40595851f51c517459286ccf4a0 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 21:47:30 -0800 Subject: [PATCH 12/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- SSI.php | 2 +- Sources/Drafts.php | 2 +- Sources/Load.php | 2 +- Sources/ShowAttachments.php | 4 +- Sources/Subs.php | 12 +-- Themes/default/scripts/script.js | 139 ++++--------------------------- 6 files changed, 26 insertions(+), 135 deletions(-) diff --git a/SSI.php b/SSI.php index f72bcad299..e823fc54de 100644 --- a/SSI.php +++ b/SSI.php @@ -203,7 +203,7 @@ // @todo: probably not the best place, but somewhere it should be set... if (!headers_sent()) - header('content-type: text/html; charset=' . (empty($modSettings['global_character_set']) ? (empty($txt['lang_character_set']) ? 'ISO-8859-1' : $txt['lang_character_set']) : $modSettings['global_character_set'])); + header('content-type: text/html; charset=UTF-8'); // Take care of any banning that needs to be done. if (isset($_REQUEST['ssi_ban']) || (isset($ssi_ban) && $ssi_ban === true)) diff --git a/Sources/Drafts.php b/Sources/Drafts.php index 736d2a8fc4..348ab9b940 100644 --- a/Sources/Drafts.php +++ b/Sources/Drafts.php @@ -495,7 +495,7 @@ function XmlDraft($id_draft) { global $txt, $context; - header('content-type: text/xml; charset=' . (empty($context['character_set']) ? 'ISO-8859-1' : $context['character_set'])); + header('content-type: text/xml; charset=UTF-8'); echo ' diff --git a/Sources/Load.php b/Sources/Load.php index 392964608a..0a8e67c21b 100644 --- a/Sources/Load.php +++ b/Sources/Load.php @@ -3457,7 +3457,7 @@ function template_include($filename, $once = false) ob_start(); if (isset($_GET['debug'])) - header('content-type: application/xhtml+xml; charset=' . (empty($context['character_set']) ? 'ISO-8859-1' : $context['character_set'])); + header('content-type: application/xhtml+xml; charset=UTF-8'); // Don't cache error pages!! header('expires: Mon, 26 Jul 1997 05:00:00 GMT'); diff --git a/Sources/ShowAttachments.php b/Sources/ShowAttachments.php index 18e808a003..4f245345b4 100644 --- a/Sources/ShowAttachments.php +++ b/Sources/ShowAttachments.php @@ -29,7 +29,7 @@ function showAttachment() global $smcFunc, $modSettings, $maintenance, $context, $txt, $user_info; // Some defaults that we need. - $context['character_set'] = empty($modSettings['global_character_set']) ? (empty($txt['lang_character_set']) ? 'ISO-8859-1' : $txt['lang_character_set']) : $modSettings['global_character_set']; + $context['character_set'] = empty($modSettings['global_character_set']) ? (empty($txt['lang_character_set']) ? 'UTF-8' : $txt['lang_character_set']) : $modSettings['global_character_set']; $context['utf8'] = $context['character_set'] === 'UTF-8'; // An early hook to set up global vars, clean cache and other early process. @@ -218,7 +218,7 @@ function showAttachment() if (!file_exists($file['filePath'])) { send_http_status(404); - header('content-type: text/plain; charset=' . (empty($context['character_set']) ? 'ISO-8859-1' : $context['character_set'])); + header('content-type: text/plain; charset=UTF-8'); // We need to die like this *before* we send any anti-caching headers as below. die('File not found.'); diff --git a/Sources/Subs.php b/Sources/Subs.php index 9f48f83770..3e8183fae0 100644 --- a/Sources/Subs.php +++ b/Sources/Subs.php @@ -1237,13 +1237,7 @@ function un_htmlspecialchars($string) static $translation = array(); // Determine the character set... Default to UTF-8 - if (empty($context['character_set'])) - $charset = 'UTF-8'; - // Use ISO-8859-1 in place of non-supported ISO-8859 charsets... - elseif (strpos($context['character_set'], 'ISO-8859-') !== false && !in_array($context['character_set'], array('ISO-8859-5', 'ISO-8859-15'))) - $charset = 'ISO-8859-1'; - else - $charset = $context['character_set']; + $charset = 'UTF-8'; if (empty($translation)) $translation = array_flip(get_html_translation_table(HTML_SPECIALCHARS, ENT_QUOTES, $charset)) + array(''' => '\'', ''' => '\'', ' ' => ' '); @@ -4505,10 +4499,10 @@ function template_header() if (!isset($_REQUEST['xml']) && isset($_GET['debug']) && !isBrowser('ie')) header('content-type: application/xhtml+xml'); elseif (!isset($_REQUEST['xml'])) - header('content-type: text/html; charset=' . (empty($context['character_set']) ? 'ISO-8859-1' : $context['character_set'])); + header('content-type: text/html; charset=UTF-8'); } - header('content-type: text/' . (isset($_REQUEST['xml']) ? 'xml' : 'html') . '; charset=' . (empty($context['character_set']) ? 'ISO-8859-1' : $context['character_set'])); + header('content-type: text/' . (isset($_REQUEST['xml']) ? 'xml' : 'html') . '; charset=UTF-8'); // We need to splice this in after the body layer, or after the main layer for older stuff. if ($context['in_maintenance'] && $context['user']['is_admin']) diff --git a/Themes/default/scripts/script.js b/Themes/default/scripts/script.js index d5bdb12a88..34017ab7ef 100644 --- a/Themes/default/scripts/script.js +++ b/Themes/default/scripts/script.js @@ -122,131 +122,28 @@ String.prototype.oCharsetConversion = { // Convert a string to an 8 bit representation (like in PHP). String.prototype.php_to8bit = function () { - if (smf_charset == 'UTF-8') - { - var n, sReturn = ''; - - // Recode from UTF16 (native .js) to UTF8 - for (var i = 0, iTextLen = this.length; i < iTextLen; i++) - { - // Below xFFFF, UTF16 simply = the code points - n = this.charCodeAt(i); - if (n < 128) - sReturn += String.fromCharCode(n); - else if (n < 2048) - sReturn += String.fromCharCode(192 | n >> 6) + String.fromCharCode(128 | n & 63); - // 0xD800 - 0xDBFF - else if (n >= 55296 && n <= 56319) - { - // In this range, this is the beginning of a surrogate pair, where 4-byte utf8 chars are - n = 65536 + ((n & 1023) << 10) + (this.charCodeAt(i + 1) & 1023); - sReturn += String.fromCharCode(240 | n >> 18) + String.fromCharCode(128 | n >> 12 & 63) + String.fromCharCode(128 | n >> 6 & 63) + String.fromCharCode(128 | n & 63); - // Skip next char, already used... - i++; - } - else - sReturn += String.fromCharCode(224 | n >> 12) + String.fromCharCode(128 | n >> 6 & 63) + String.fromCharCode(128 | n & 63); - } + var n, sReturn = ''; - return sReturn; - } - - else if (this.oCharsetConversion.from.length == 0) + // Recode from UTF16 (native .js) to UTF8 + for (var i = 0, iTextLen = this.length; i < iTextLen; i++) { - switch (smf_charset) + // Below xFFFF, UTF16 simply = the code points + n = this.charCodeAt(i); + if (n < 128) + sReturn += String.fromCharCode(n); + else if (n < 2048) + sReturn += String.fromCharCode(192 | n >> 6) + String.fromCharCode(128 | n & 63); + // 0xD800 - 0xDBFF + else if (n >= 55296 && n <= 56319) { - case 'ISO-8859-1': - this.oCharsetConversion = { - from: '\xa0-\xff', - to: '\xa0-\xff' - }; - break; - - case 'ISO-8859-2': - this.oCharsetConversion = { - from: '\xa0\u0104\u02d8\u0141\xa4\u013d\u015a\xa7\xa8\u0160\u015e\u0164\u0179\xad\u017d\u017b\xb0\u0105\u02db\u0142\xb4\u013e\u015b\u02c7\xb8\u0161\u015f\u0165\u017a\u02dd\u017e\u017c\u0154\xc1\xc2\u0102\xc4\u0139\u0106\xc7\u010c\xc9\u0118\xcb\u011a\xcd\xce\u010e\u0110\u0143\u0147\xd3\xd4\u0150\xd6\xd7\u0158\u016e\xda\u0170\xdc\xdd\u0162\xdf\u0155\xe1\xe2\u0103\xe4\u013a\u0107\xe7\u010d\xe9\u0119\xeb\u011b\xed\xee\u010f\u0111\u0144\u0148\xf3\xf4\u0151\xf6\xf7\u0159\u016f\xfa\u0171\xfc\xfd\u0163\u02d9', - to: '\xa0-\xff' - }; - break; - - case 'ISO-8859-5': - this.oCharsetConversion = { - from: '\xa0\u0401-\u040c\xad\u040e-\u044f\u2116\u0451-\u045c\xa7\u045e\u045f', - to: '\xa0-\xff' - }; - break; - - case 'ISO-8859-9': - this.oCharsetConversion = { - from: '\xa0-\xcf\u011e\xd1-\xdc\u0130\u015e\xdf-\xef\u011f\xf1-\xfc\u0131\u015f\xff', - to: '\xa0-\xff' - }; - break; - - case 'ISO-8859-15': - this.oCharsetConversion = { - from: '\xa0-\xa3\u20ac\xa5\u0160\xa7\u0161\xa9-\xb3\u017d\xb5-\xb7\u017e\xb9-\xbb\u0152\u0153\u0178\xbf-\xff', - to: '\xa0-\xff' - }; - break; - - case 'tis-620': - this.oCharsetConversion = { - from: '\u20ac\u2026\u2018\u2019\u201c\u201d\u2022\u2013\u2014\xa0\u0e01-\u0e3a\u0e3f-\u0e5b', - to: '\x80\x85\x91-\x97\xa0-\xda\xdf-\xfb' - }; - break; - - case 'windows-1251': - this.oCharsetConversion = { - from: '\u0402\u0403\u201a\u0453\u201e\u2026\u2020\u2021\u20ac\u2030\u0409\u2039\u040a\u040c\u040b\u040f\u0452\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u2122\u0459\u203a\u045a\u045c\u045b\u045f\xa0\u040e\u045e\u0408\xa4\u0490\xa6\xa7\u0401\xa9\u0404\xab-\xae\u0407\xb0\xb1\u0406\u0456\u0491\xb5-\xb7\u0451\u2116\u0454\xbb\u0458\u0405\u0455\u0457\u0410-\u044f', - to: '\x80-\x97\x99-\xff' - }; - break; - - case 'windows-1253': - this.oCharsetConversion = { - from: '\u20ac\u201a\u0192\u201e\u2026\u2020\u2021\u2030\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u2122\u203a\xa0\u0385\u0386\xa3-\xa9\xab-\xae\u2015\xb0-\xb3\u0384\xb5-\xb7\u0388-\u038a\xbb\u038c\xbd\u038e-\u03a1\u03a3-\u03ce', - to: '\x80\x82-\x87\x89\x8b\x91-\x97\x99\x9b\xa0-\xa9\xab-\xd1\xd3-\xfe' - }; - break; - - case 'windows-1255': - this.oCharsetConversion = { - from: '\u20ac\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u203a\xa0-\xa3\u20aa\xa5-\xa9\xd7\xab-\xb9\xf7\xbb-\xbf\u05b0-\u05b9\u05bb-\u05c3\u05f0-\u05f4\u05d0-\u05ea\u200e\u200f', - to: '\x80\x82-\x89\x8b\x91-\x99\x9b\xa0-\xc9\xcb-\xd8\xe0-\xfa\xfd\xfe' - }; - break; - - case 'windows-1256': - this.oCharsetConversion = { - from: '\u20ac\u067e\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0679\u2039\u0152\u0686\u0698\u0688\u06af\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u06a9\u2122\u0691\u203a\u0153\u200c\u200d\u06ba\xa0\u060c\xa2-\xa9\u06be\xab-\xb9\u061b\xbb-\xbe\u061f\u06c1\u0621-\u0636\xd7\u0637-\u063a\u0640-\u0643\xe0\u0644\xe2\u0645-\u0648\xe7-\xeb\u0649\u064a\xee\xef\u064b-\u064e\xf4\u064f\u0650\xf7\u0651\xf9\u0652\xfb\xfc\u200e\u200f\u06d2', - to: '\x80-\xff' - }; - break; - - default: - this.oCharsetConversion = { - from: '', - to: '' - }; - break; + // In this range, this is the beginning of a surrogate pair, where 4-byte utf8 chars are + n = 65536 + ((n & 1023) << 10) + (this.charCodeAt(i + 1) & 1023); + sReturn += String.fromCharCode(240 | n >> 18) + String.fromCharCode(128 | n >> 12 & 63) + String.fromCharCode(128 | n >> 6 & 63) + String.fromCharCode(128 | n & 63); + // Skip next char, already used... + i++; } - var funcExpandString = function (sSearch) { - var sInsert = ''; - for (var i = sSearch.charCodeAt(0), n = sSearch.charCodeAt(2); i <= n; i++) - sInsert += String.fromCharCode(i); - return sInsert; - }; - this.oCharsetConversion.from = this.oCharsetConversion.from.replace(/.\-./g, funcExpandString); - this.oCharsetConversion.to = this.oCharsetConversion.to.replace(/.\-./g, funcExpandString); - } - - var sReturn = '', iOffsetFrom = 0; - for (var i = 0, n = this.length; i < n; i++) - { - iOffsetFrom = this.oCharsetConversion.from.indexOf(this.charAt(i)); - sReturn += iOffsetFrom > -1 ? this.oCharsetConversion.to.charAt(iOffsetFrom) : (this.charCodeAt(i) > 127 ? '&#' + this.charCodeAt(i) + ';' : this.charAt(i)); + else + sReturn += String.fromCharCode(224 | n >> 12) + String.fromCharCode(128 | n >> 6 & 63) + String.fromCharCode(128 | n & 63); } return sReturn From 4af44dc73e9e1874ee37e0f7ad35a5ad1b183b78 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 12 Dec 2020 21:49:55 -0800 Subject: [PATCH 13/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- subscriptions.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subscriptions.php b/subscriptions.php index 0a022bc1cb..09bbb81cf9 100644 --- a/subscriptions.php +++ b/subscriptions.php @@ -37,7 +37,7 @@ // If there's literally nothing coming in, let's take flight! if (empty($_POST)) { - header('content-type: text/html; charset=' . (empty($modSettings['global_character_set']) ? (empty($txt['lang_character_set']) ? 'ISO-8859-1' : $txt['lang_character_set']) : $modSettings['global_character_set'])); + header('content-type: text/html; charset=UTF-8'); die($txt['paid_no_data']); } From 26b7ad8fb87a4b135124714d3a27ec875cc1da7a Mon Sep 17 00:00:00 2001 From: sbulen Date: Sun, 13 Dec 2020 19:42:01 -0800 Subject: [PATCH 14/26] Too small; make consistent with other calls Signed by Shawn Bulen, bulens@pacbell.net --- Sources/ManageMaintenance.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/ManageMaintenance.php b/Sources/ManageMaintenance.php index e5d2b24376..3a2ded3403 100644 --- a/Sources/ManageMaintenance.php +++ b/Sources/ManageMaintenance.php @@ -843,7 +843,7 @@ function ConvertEntities() if ($column_name !== $primary_key && strpos($column_value, '&#') !== false) { $changes[] = $column_name . ' = {string:changes_' . $column_name . '}'; - $insertion_variables['changes_' . $column_name] = preg_replace_callback('~&#(\d{1,5}|x[0-9a-fA-F]{1,4});~', 'fixchardb__callback', $column_value); + $insertion_variables['changes_' . $column_name] = preg_replace_callback('~&#(\d{1,7}|x[0-9a-fA-F]{1,6});~', 'fixchardb__callback', $column_value); } $where = array(); From 398739ac27ad9c73c10457bfda96dc42190f123a Mon Sep 17 00:00:00 2001 From: sbulen Date: Thu, 23 Sep 2021 18:42:21 -0700 Subject: [PATCH 15/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Subs.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/Subs.php b/Sources/Subs.php index 3e8183fae0..0697c917d0 100644 --- a/Sources/Subs.php +++ b/Sources/Subs.php @@ -3077,7 +3077,7 @@ function($a, $b) } $tmp_data = preg_replace_callback( - '~' . $url_regex . '~i' . ($context['utf8'] ? 'u' : ''), + '~' . $url_regex . '~iu', function($matches) use ($schemes) { $url = array_shift($matches); @@ -3144,7 +3144,7 @@ function($matches) use ($schemes) // Followed by a non-domain character or end of line '(?=[^' . $domain_label_chars . ']|$)'; - $tmp_data = preg_replace('~' . $email_regex . '~i' . ($context['utf8'] ? 'u' : ''), '[email]$0[/email]', $data); + $tmp_data = preg_replace('~' . $email_regex . '~iu', '[email]$0[/email]', $data); if (!is_null($tmp_data)) $data = $tmp_data; From d10533d1fe9cdbe2dbd841f70755635beda35d03 Mon Sep 17 00:00:00 2001 From: sbulen Date: Thu, 23 Sep 2021 18:54:14 -0700 Subject: [PATCH 16/26] Use unicode Signed by Shawn Bulen, bulens@pacbell.net --- Sources/DbPackages-mysql.php | 2 +- Sources/ManageMaintenance.php | 4 ++-- other/install.php | 4 ++-- other/upgrade.php | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Sources/DbPackages-mysql.php b/Sources/DbPackages-mysql.php index 67151a6861..18099cbdbb 100644 --- a/Sources/DbPackages-mysql.php +++ b/Sources/DbPackages-mysql.php @@ -200,7 +200,7 @@ function smf_db_create_table($table_name, $columns, $indexes = array(), $paramet if (!empty($db_character_set) && $db_character_set == 'utf8') $table_query .= ' DEFAULT CHARSET=utf8 COLLATE=utf8_general_ci'; else - $table_query .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; + $table_query .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci'; // Create the table! $smcFunc['db_query']('', $table_query, diff --git a/Sources/ManageMaintenance.php b/Sources/ManageMaintenance.php index 3a2ded3403..a61b8ca7ef 100644 --- a/Sources/ManageMaintenance.php +++ b/Sources/ManageMaintenance.php @@ -474,7 +474,7 @@ function ConvertMsgBody() } /** - * Converts all text columns from utf8_general_ci to utf8mb4_general_ci. + * Converts all text columns from utf8_general_ci to utf8mb4_unicode_ci. * Assumption: This forum has undergone a UTF8 conversion. * * This action is linked from the maintenance screen (if applicable). @@ -619,7 +619,7 @@ function ConvertUtf8mb4() foreach ($db_tables AS $cur_table) { $request = $smcFunc['db_query']('', ' - ALTER TABLE ' . $cur_table . ' CONVERT TO CHARACTER SET \'utf8mb4\' COLLATE \'utf8mb4_general_ci\'', + ALTER TABLE ' . $cur_table . ' CONVERT TO CHARACTER SET \'utf8mb4\' COLLATE \'utf8mb4_unicode_ci\'', array( ) ); diff --git a/other/install.php b/other/install.php index 5fdf1ca86a..d1dd3d91b7 100644 --- a/other/install.php +++ b/other/install.php @@ -1211,8 +1211,8 @@ function DatabasePopulation() $replaces['{$memory}'] = (!$has_innodb && in_array('MEMORY', $engines)) ? 'MEMORY' : $replaces['{$engine}']; // UTF-8 is required. - $replaces['{$engine}'] .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; - $replaces['{$memory}'] .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; + $replaces['{$engine}'] .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci'; + $replaces['{$memory}'] .= ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci'; // One last thing - if we don't have InnoDB, we can't do transactions... if (!$has_innodb) diff --git a/other/upgrade.php b/other/upgrade.php index 51fa4c011c..c68a65bd0b 100644 --- a/other/upgrade.php +++ b/other/upgrade.php @@ -2043,7 +2043,7 @@ function($errno, $errstr, $errfile, $errline) use ($support_js) // If we're on MySQL, set {db_collation}; this approach is used throughout upgrade_2-0_mysql.php to set new tables to utf8 // Note it is expected to be in the format: ENGINE=MyISAM{$db_collation}; if ($db_type == 'mysql') - $db_collation = ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci'; + $db_collation = ' DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci'; else $db_collation = ''; @@ -2057,7 +2057,7 @@ function($errno, $errstr, $errfile, $errline) use ($support_js) $last_step = ''; // Make sure all newly created tables will have the proper characters set; this approach is used throughout upgrade_2-1_mysql.php - $lines = str_replace(') ENGINE=MyISAM;', ') ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;', $lines); + $lines = str_replace(') ENGINE=MyISAM;', ') ENGINE=MyISAM DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;', $lines); // Count the total number of steps within this file - for progress. $file_steps = substr_count(implode('', $lines), '---#'); From 0adbe34483a4f7ce5dd2c27296a02495b46cd782 Mon Sep 17 00:00:00 2001 From: sbulen Date: Thu, 23 Sep 2021 19:21:19 -0700 Subject: [PATCH 17/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Load.php | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Sources/Load.php b/Sources/Load.php index 0a8e67c21b..928a84e0c2 100644 --- a/Sources/Load.php +++ b/Sources/Load.php @@ -272,15 +272,12 @@ function reloadSettings() return random_bytes($length); }, - 'normalize' => function($string, $form = 'c') use ($utf8) + 'normalize' => function($string, $form = 'c') { global $sourcedir; $string = (string) $string; - if (!$utf8) - return $string; - require_once($sourcedir . '/Subs-Charset.php'); $normalize_func = 'utf8_normalize_' . strtolower((string) $form); From b588dc87be3153b218a496598fa714461a5bbf1a Mon Sep 17 00:00:00 2001 From: sbulen Date: Thu, 23 Sep 2021 19:30:14 -0700 Subject: [PATCH 18/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Subs.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/Subs.php b/Sources/Subs.php index 0697c917d0..5df0dc561f 100644 --- a/Sources/Subs.php +++ b/Sources/Subs.php @@ -2729,14 +2729,14 @@ function($a, $b) // Some reusable character classes $excluded_trailing_chars = '!;:.,?'; - $domain_label_chars = '0-9A-Za-z\-' . ($context['utf8'] ? implode('', array( + $domain_label_chars = '0-9A-Za-z\-' . implode('', array( '\x{A0}-\x{D7FF}', '\x{F900}-\x{FDCF}', '\x{FDF0}-\x{FFEF}', '\x{10000}-\x{1FFFD}', '\x{20000}-\x{2FFFD}', '\x{30000}-\x{3FFFD}', '\x{40000}-\x{4FFFD}', '\x{50000}-\x{5FFFD}', '\x{60000}-\x{6FFFD}', '\x{70000}-\x{7FFFD}', '\x{80000}-\x{8FFFD}', '\x{90000}-\x{9FFFD}', '\x{A0000}-\x{AFFFD}', '\x{B0000}-\x{BFFFD}', '\x{C0000}-\x{CFFFD}', '\x{D0000}-\x{DFFFD}', '\x{E1000}-\x{EFFFD}', - )) : ''); + )); // Parse any URLs if (!isset($disabled['url']) && strpos($data, '[url') === false) From ed6bb671e4f7164a78aef5b86aaa848aa51b5ea9 Mon Sep 17 00:00:00 2001 From: sbulen Date: Fri, 24 Sep 2021 15:52:19 -0700 Subject: [PATCH 19/26] Must also clean up old alt index names Signed by Shawn Bulen, bulens@pacbell.net --- other/upgrade_2-1_mysql.sql | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/other/upgrade_2-1_mysql.sql b/other/upgrade_2-1_mysql.sql index 44b4f4a727..9c1f771084 100644 --- a/other/upgrade_2-1_mysql.sql +++ b/other/upgrade_2-1_mysql.sql @@ -3818,32 +3818,42 @@ foreach($files AS $filename) /******************************************************************************/ --- Prepare indexes for mb4 /******************************************************************************/ ----# real_name column drop +---# Real_name column drop ALTER TABLE {$db_prefix}members DROP INDEX idx_real_name; ---# ----# real_name column recreate +---# Real_name column drop - old alt name +ALTER TABLE {$db_prefix}members +DROP INDEX real_name; +---# + +---# Real_name column recreate ALTER TABLE {$db_prefix}members ADD INDEX idx_real_name (real_name(191)); ---# ----# email column drop +---# Email column drop ALTER TABLE {$db_prefix}members DROP INDEX idx_email_address; ---# ----# email column recreate +---# Email column drop - old alt name +ALTER TABLE {$db_prefix}members +DROP INDEX email_address; +---# + +---# Email column recreate ALTER TABLE {$db_prefix}members ADD INDEX idx_email_address (email_address(191)); ---# ----# lngfile column drop +---# Lngfile column drop ALTER TABLE {$db_prefix}qanda DROP INDEX idx_lngfile; ---# ----# lngfile column recreate +---# Lngfile column recreate ALTER TABLE {$db_prefix}qanda ADD INDEX idx_lngfile (lngfile(191)); ---# From 26b310adbe64a8dab5de7a51032b9a1ffd4cb9f7 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 25 Sep 2021 12:40:32 -0700 Subject: [PATCH 20/26] Use unicode Signed by Shawn Bulen, bulens@pacbell.net --- other/upgrade.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/other/upgrade.php b/other/upgrade.php index c68a65bd0b..1442fb25c7 100644 --- a/other/upgrade.php +++ b/other/upgrade.php @@ -3253,7 +3253,7 @@ function ConvertUtf8() $smcFunc['db_query']('', ' ALTER TABLE {raw:table_name} - CONVERT TO CHARACTER SET utf8mb4', + CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci', array( 'table_name' => $table_info['Name'], ) From 9ac9de45e8280f9a51b2dfc6aad33d545a31416c Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 25 Sep 2021 13:28:06 -0700 Subject: [PATCH 21/26] Installer & upgrader to utf8mb4 Signed by Shawn Bulen, bulens@pacbell.net --- other/install.php | 5 ++++- other/upgrade.php | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/other/install.php b/other/install.php index d1dd3d91b7..67641e943a 100644 --- a/other/install.php +++ b/other/install.php @@ -1101,7 +1101,10 @@ function ForumSettings() } // Set the character set here. - installer_updateSettingsFile(array('db_character_set' => 'utf8'), true); + if ($db_type === 'postgresql') + installer_updateSettingsFile(array('db_character_set' => 'utf8'), true); + else + installer_updateSettingsFile(array('db_character_set' => 'utf8mb4'), true); // Good, skip on. return true; diff --git a/other/upgrade.php b/other/upgrade.php index 1442fb25c7..b595f63b95 100644 --- a/other/upgrade.php +++ b/other/upgrade.php @@ -2811,7 +2811,7 @@ function ConvertUtf8() return true; } // First make sure they aren't already on UTF-8 before we go anywhere... - if ($db_type == 'postgresql' || ($db_character_set === 'utf8' && !empty($modSettings['global_character_set']) && $modSettings['global_character_set'] === 'UTF-8')) + if ($db_type == 'postgresql' || $db_character_set === 'utf8mb4') { $smcFunc['db_insert']('replace', '{db_prefix}settings', @@ -3280,7 +3280,7 @@ function ConvertUtf8() // Hopefully this works... require_once($sourcedir . '/Subs.php'); require_once($sourcedir . '/Subs-Admin.php'); - updateSettingsFile(array('db_character_set' => 'utf8')); + updateSettingsFile(array('db_character_set' => 'utf8mb4')); updateSettingsFile(array('db_mb4' => true)); // The conversion might have messed up some serialized strings. Fix them! From 056f035059364c46ef401db8a05620e66b564d82 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 8 Jul 2023 21:38:14 -0700 Subject: [PATCH 22/26] Preserve legacy setting in case used by mods Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Load.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Sources/Load.php b/Sources/Load.php index 6f35abbf03..0c7adf6d77 100644 --- a/Sources/Load.php +++ b/Sources/Load.php @@ -101,6 +101,9 @@ function reloadSettings() if (empty($modSettings['force_ssl'])) $image_proxy_enabled = false; + // Preserve legacy utf8 variable in case used by mods + $context['utf8'] = 'UTF-8'; + // Set a list of common functions. $ent_list = '&(?:#' . (empty($modSettings['disableEntityCheck']) ? '\d{1,7}' : '021') . '|quot|amp|lt|gt|nbsp);'; $ent_check = empty($modSettings['disableEntityCheck']) ? function($string) From 13a62712a98a7b77d2dbc5a1274d332656322263 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sat, 8 Jul 2023 22:24:03 -0700 Subject: [PATCH 23/26] Preserve legacy settings in case used by mods Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Load.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/Load.php b/Sources/Load.php index 0c7adf6d77..3eb4b4d5eb 100644 --- a/Sources/Load.php +++ b/Sources/Load.php @@ -101,8 +101,8 @@ function reloadSettings() if (empty($modSettings['force_ssl'])) $image_proxy_enabled = false; - // Preserve legacy utf8 variable in case used by mods - $context['utf8'] = 'UTF-8'; + // Preserve legacy utf8 variables in case used by mods + $context['utf8'] = $utf8 = 'UTF-8'; // Set a list of common functions. $ent_list = '&(?:#' . (empty($modSettings['disableEntityCheck']) ? '\d{1,7}' : '021') . '|quot|amp|lt|gt|nbsp);'; From 583f69f51395626057657dd2a9133875a3112093 Mon Sep 17 00:00:00 2001 From: sbulen Date: Tue, 11 Jul 2023 20:59:07 -0700 Subject: [PATCH 24/26] Only utf8 going forward Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Subs.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/Subs.php b/Sources/Subs.php index e78ad6c50b..3dc73d6052 100644 --- a/Sources/Subs.php +++ b/Sources/Subs.php @@ -5274,7 +5274,7 @@ function host_from_ip($ip) */ function text2words($text, $max_chars = 20, $encrypt = false) { - global $smcFunc, $context; + global $smcFunc, $context, $db_character_set; // Upgrader uses this function & may be working on old DBs... if (!isset($db_character_set) || (isset($db_character_set) && ($db_character_set != 'utf8') && ($db_character_set != 'utf8mb4'))) @@ -5314,7 +5314,7 @@ function text2words($text, $max_chars = 20, $encrypt = false) $returned_words = array(); foreach ($words as $word) if (($word = trim($word, '-_\'')) !== '') - $returned_words[] = $max_chars === null ? $word : substr($word, 0, $max_chars); + $returned_words[] = $max_chars === null ? $word : $smcFunc['substr']($word, 0, $max_chars); // Filter out all words that occur more than once. return array_unique($returned_words); From a9bac5130f0c8427e327d2bee0b147f3fd48578b Mon Sep 17 00:00:00 2001 From: sbulen Date: Wed, 12 Jul 2023 20:28:47 -0700 Subject: [PATCH 25/26] More mb4 index changes Signed by Shawn Bulen, bulens@pacbell.net --- Sources/ManageMaintenance.php | 70 +++++++++++++++++++++++++++++++---- other/upgrade_2-1_mysql.sql | 2 +- 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/Sources/ManageMaintenance.php b/Sources/ManageMaintenance.php index 23a62c48c7..5c76cd7fc5 100644 --- a/Sources/ManageMaintenance.php +++ b/Sources/ManageMaintenance.php @@ -523,13 +523,13 @@ function ConvertUtf8mb4() $db_tables[] = $row['TABLE_NAME']; $smcFunc['db_free_result']($request); - // Check each of the three indexes that may need updating - #1 + // Check each of the four indexes that may need updating - #1 $request = $smcFunc['db_query']('', ' SELECT SUB_PART FROM information_schema.STATISTICS WHERE TABLE_NAME = {string:cur_table} AND TABLE_SCHEMA = {string:cur_schema} - AND INDEX_NAME = \'idx_real_name\' + AND INDEX_NAME IN (\'idx_real_name\', \'real_name\') AND COLUMN_NAME = \'real_name\'', array( 'cur_schema' => $db_name, @@ -545,13 +545,35 @@ function ConvertUtf8mb4() $smcFunc['db_free_result']($request); - // Check each of the three indexes that may need updating - #2 + // Check each of the four indexes that may need updating - #2 $request = $smcFunc['db_query']('', ' SELECT SUB_PART FROM information_schema.STATISTICS WHERE TABLE_NAME = {string:cur_table} AND TABLE_SCHEMA = {string:cur_schema} - AND INDEX_NAME = \'idx_email_address\' + AND INDEX_NAME = \'idx_active_real_name\' + AND COLUMN_NAME = \'real_name\'', + array( + 'cur_schema' => $db_name, + 'cur_table' => $db_prefix . 'members', + ) + ); + + list($fix_active_real_name) = $smcFunc['db_fetch_row']($request); + if ($fix_active_real_name == null) + $fix_active_real_name = true; + else + $fix_active_real_name = false; + + $smcFunc['db_free_result']($request); + + // Check each of the four indexes that may need updating - #3 + $request = $smcFunc['db_query']('', ' + SELECT SUB_PART + FROM information_schema.STATISTICS + WHERE TABLE_NAME = {string:cur_table} + AND TABLE_SCHEMA = {string:cur_schema} + AND INDEX_NAME IN (\'idx_email_address\', \'email_address\') AND COLUMN_NAME = \'email_address\'', array( 'cur_schema' => $db_name, @@ -567,7 +589,7 @@ function ConvertUtf8mb4() $smcFunc['db_free_result']($request); - // Check each of the three indexes that may need updating - #3 + // Check each of the four indexes that may need updating - #4 $request = $smcFunc['db_query']('', ' SELECT SUB_PART FROM information_schema.STATISTICS @@ -592,12 +614,29 @@ function ConvertUtf8mb4() // After this point we are starting the conversion. But first: session check. checkSession(); - // First, drop the three indexes if they need fixing... + // First, drop the four indexes (using new & old names) if they need fixing... if ($fix_real_name) $request = $smcFunc['db_query']('', ' ALTER TABLE {db_prefix}members DROP INDEX idx_real_name', array( + 'db_error_skip' => true, + ) + ); + if ($fix_real_name) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + DROP INDEX real_name', + array( + 'db_error_skip' => true, + ) + ); + if ($fix_active_real_name) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + DROP INDEX idx_active_real_name', + array( + 'db_error_skip' => true, ) ); if ($fix_email_address) @@ -605,6 +644,15 @@ function ConvertUtf8mb4() ALTER TABLE {db_prefix}members DROP INDEX idx_email_address', array( + 'db_error_skip' => true, + ) + ); + if ($fix_email_address) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + DROP INDEX email_address', + array( + 'db_error_skip' => true, ) ); if ($fix_lngfile) @@ -612,6 +660,7 @@ function ConvertUtf8mb4() ALTER TABLE {db_prefix}qanda DROP INDEX idx_lngfile', array( + 'db_error_skip' => true, ) ); @@ -625,7 +674,7 @@ function ConvertUtf8mb4() ); } - // Next, fix the three indexes... + // Next, fix the four indexes... if ($fix_real_name) $request = $smcFunc['db_query']('', ' ALTER TABLE {db_prefix}members @@ -633,6 +682,13 @@ function ConvertUtf8mb4() array( ) ); + if ($fix_active_real_name) + $request = $smcFunc['db_query']('', ' + ALTER TABLE {db_prefix}members + ADD INDEX idx_active_real_name (is_activated, real_name(191))', + array( + ) + ); if ($fix_email_address) $request = $smcFunc['db_query']('', ' ALTER TABLE {db_prefix}members diff --git a/other/upgrade_2-1_mysql.sql b/other/upgrade_2-1_mysql.sql index 595b06360d..2a0889b99b 100644 --- a/other/upgrade_2-1_mysql.sql +++ b/other/upgrade_2-1_mysql.sql @@ -3197,7 +3197,7 @@ DROP INDEX idx_active_real_name; ---# Updating members active_real_name (add) ALTER TABLE {$db_prefix}members -ADD INDEX idx_active_real_name (is_activated, real_name); +ADD INDEX idx_active_real_name (is_activated, real_name(191)); ---# ---# Updating messages drop old ipIndex From 0250924f9cf59b6f952d9a0d541335ff1f3b76c2 Mon Sep 17 00:00:00 2001 From: sbulen Date: Sun, 16 Jul 2023 15:08:02 -0700 Subject: [PATCH 26/26] mb_substr required here for upgrades Signed by Shawn Bulen, bulens@pacbell.net --- Sources/Subs.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/Subs.php b/Sources/Subs.php index 3dc73d6052..843dbab39e 100644 --- a/Sources/Subs.php +++ b/Sources/Subs.php @@ -5314,7 +5314,7 @@ function text2words($text, $max_chars = 20, $encrypt = false) $returned_words = array(); foreach ($words as $word) if (($word = trim($word, '-_\'')) !== '') - $returned_words[] = $max_chars === null ? $word : $smcFunc['substr']($word, 0, $max_chars); + $returned_words[] = $max_chars === null ? $word : mb_substr($word, 0, $max_chars); // Filter out all words that occur more than once. return array_unique($returned_words);