To: vim_dev@googlegroups.com Subject: Patch 8.2.0938 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 8.2.0938 Problem: NFA regexp uses tolower() to compare ignore-case. (Thayne McCombs) Solution: Use utf_fold() when possible. (ref. neovim #12456) Files: src/macros.h, src/diff.c, src/regexp_nfa.c, src/testdir/test_regexp_utf8.vim *** ../vim-8.2.0937/src/macros.h 2020-06-01 17:28:31.511939716 +0200 --- src/macros.h 2020-06-09 19:04:44.000093479 +0200 *************** *** 93,98 **** --- 93,99 ---- #define MB_ISUPPER(c) vim_isupper(c) #define MB_TOLOWER(c) vim_tolower(c) #define MB_TOUPPER(c) vim_toupper(c) + #define MB_CASEFOLD(c) (enc_utf8 ? utf_fold(c) : MB_TOLOWER(c)) // Use our own isdigit() replacement, because on MS-Windows isdigit() returns // non-zero for superscript 1. Also avoids that isdigit() crashes for numbers *** ../vim-8.2.0937/src/diff.c 2020-06-07 20:49:02.077891881 +0200 --- src/diff.c 2020-06-09 19:04:25.088157656 +0200 *************** *** 747,753 **** // xdiff doesn't support ignoring case, fold-case the text. c = PTR2CHAR(s); ! c = enc_utf8 ? utf_fold(c) : MB_TOLOWER(c); orig_len = mb_ptr2len(s); if (mb_char2bytes(c, cbuf) != orig_len) // TODO: handle byte length difference --- 747,753 ---- // xdiff doesn't support ignoring case, fold-case the text. c = PTR2CHAR(s); ! c = MB_CASEFOLD(c); orig_len = mb_ptr2len(s); if (mb_char2bytes(c, cbuf) != orig_len) // TODO: handle byte length difference *** ../vim-8.2.0937/src/regexp_nfa.c 2020-06-03 18:55:35.961570633 +0200 --- src/regexp_nfa.c 2020-06-09 19:26:59.135466405 +0200 *************** *** 5459,5465 **** { c1 = PTR2CHAR(match_text + len1); c2 = PTR2CHAR(rex.line + col + len2); ! if (c1 != c2 && (!rex.reg_ic || MB_TOLOWER(c1) != MB_TOLOWER(c2))) { match = FALSE; break; --- 5459,5465 ---- { c1 = PTR2CHAR(match_text + len1); c2 = PTR2CHAR(rex.line + col + len2); ! if (c1 != c2 && (!rex.reg_ic || MB_CASEFOLD(c1) != MB_CASEFOLD(c2))) { match = FALSE; break; *************** *** 6271,6281 **** } if (rex.reg_ic) { ! int curc_low = MB_TOLOWER(curc); int done = FALSE; for ( ; c1 <= c2; ++c1) ! if (MB_TOLOWER(c1) == curc_low) { result = result_if_matched; done = TRUE; --- 6271,6281 ---- } if (rex.reg_ic) { ! int curc_low = MB_CASEFOLD(curc); int done = FALSE; for ( ; c1 <= c2; ++c1) ! if (MB_CASEFOLD(c1) == curc_low) { result = result_if_matched; done = TRUE; *************** *** 6287,6294 **** } else if (state->c < 0 ? check_char_class(state->c, curc) : (curc == state->c ! || (rex.reg_ic && MB_TOLOWER(curc) ! == MB_TOLOWER(state->c)))) { result = result_if_matched; break; --- 6287,6294 ---- } else if (state->c < 0 ? check_char_class(state->c, curc) : (curc == state->c ! || (rex.reg_ic && MB_CASEFOLD(curc) ! == MB_CASEFOLD(state->c)))) { result = result_if_matched; break; *************** *** 6713,6719 **** result = (c == curc); if (!result && rex.reg_ic) ! result = MB_TOLOWER(c) == MB_TOLOWER(curc); // If rex.reg_icombine is not set only skip over the character // itself. When it is set skip over composing characters. if (result && enc_utf8 && !rex.reg_icombine) --- 6713,6719 ---- result = (c == curc); if (!result && rex.reg_ic) ! result = MB_CASEFOLD(c) == MB_CASEFOLD(curc); // If rex.reg_icombine is not set only skip over the character // itself. When it is set skip over composing characters. if (result && enc_utf8 && !rex.reg_icombine) *************** *** 6882,6888 **** // cheaper than adding a state that won't match. c = PTR2CHAR(rex.input + clen); if (c != prog->regstart && (!rex.reg_ic ! || MB_TOLOWER(c) != MB_TOLOWER(prog->regstart))) { #ifdef ENABLE_LOG fprintf(log_fd, " Skipping start state, regstart does not match\n"); --- 6882,6888 ---- // cheaper than adding a state that won't match. c = PTR2CHAR(rex.input + clen); if (c != prog->regstart && (!rex.reg_ic ! || MB_CASEFOLD(c) != MB_CASEFOLD(prog->regstart))) { #ifdef ENABLE_LOG fprintf(log_fd, " Skipping start state, regstart does not match\n"); *** ../vim-8.2.0937/src/testdir/test_regexp_utf8.vim 2019-12-16 22:43:22.328823387 +0100 --- src/testdir/test_regexp_utf8.vim 2020-06-09 19:28:03.727241430 +0200 *************** *** 355,358 **** --- 355,377 ---- set regexpengine& ambiwidth& endfunc + func Run_regexp_ignore_case() + call assert_equal('iIİ', substitute('iIİ', '\([iIİ]\)', '\1', 'g')) + + call assert_equal('iIx', substitute('iIİ', '\c\([İ]\)', 'x', 'g')) + call assert_equal('xxİ', substitute('iIİ', '\(i\c\)', 'x', 'g')) + call assert_equal('iIx', substitute('iIİ', '\(İ\c\)', 'x', 'g')) + call assert_equal('iIx', substitute('iIİ', '\c\(\%u0130\)', 'x', 'g')) + call assert_equal('iIx', substitute('iIİ', '\c\([\u0130]\)', 'x', 'g')) + call assert_equal('iIx', substitute('iIİ', '\c\([\u012f-\u0131]\)', 'x', 'g')) + endfunc + + func Test_regexp_ignore_case() + set regexpengine=1 + call Run_regexp_ignore_case() + set regexpengine=2 + call Run_regexp_ignore_case() + set regexpengine& + endfunc + " vim: shiftwidth=2 sts=2 expandtab *** ../vim-8.2.0937/src/version.c 2020-06-09 17:30:00.515654723 +0200 --- src/version.c 2020-06-09 19:30:58.914635149 +0200 *************** *** 756,757 **** --- 756,759 ---- { /* Add new patch number below this line */ + /**/ + 938, /**/ -- Not too long ago, a keyboard was something to make music with... /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ an exciting new programming language -- http://www.Zimbu.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///