diff --git a/changelog.d/15143.misc b/changelog.d/15143.misc new file mode 100644 index 0000000000..cff4518811 --- /dev/null +++ b/changelog.d/15143.misc @@ -0,0 +1 @@ +Fix a long-standing bug where the user directory search was not case-insensitive for accented characters. diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index c3f2b61bd5..f16a509ac4 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -14,6 +14,7 @@ import logging import re +import unicodedata from typing import ( TYPE_CHECKING, Iterable, @@ -490,6 +491,11 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore): values={"display_name": display_name, "avatar_url": avatar_url}, ) + # The display name that goes into the database index. + index_display_name = display_name + if index_display_name is not None: + index_display_name = _filter_text_for_index(index_display_name) + if isinstance(self.database_engine, PostgresEngine): # We weight the localpart most highly, then display name and finally # server name @@ -507,11 +513,15 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore): user_id, get_localpart_from_id(user_id), get_domain_from_id(user_id), - display_name, + index_display_name, ), ) elif isinstance(self.database_engine, Sqlite3Engine): - value = "%s %s" % (user_id, display_name) if display_name else user_id + value = ( + "%s %s" % (user_id, index_display_name) + if index_display_name + else user_id + ) self.db_pool.simple_upsert_txn( txn, table="user_directory_search", @@ -896,6 +906,41 @@ class UserDirectoryStore(UserDirectoryBackgroundUpdateStore): return {"limited": limited, "results": results[0:limit]} +def _filter_text_for_index(text: str) -> str: + """Transforms text before it is inserted into the user directory index, or searched + for in the user directory index. + + Note that the user directory search table needs to be rebuilt whenever this function + changes. + """ + # Lowercase the text, to make searches case-insensitive. + # This is necessary for both PostgreSQL and SQLite. PostgreSQL's + # `to_tsquery/to_tsvector` functions don't lowercase non-ASCII characters when using + # the "C" collation, while SQLite just doesn't lowercase non-ASCII characters at + # all. + text = text.lower() + + # Normalize the text. NFKC normalization has two effects: + # 1. It canonicalizes the text, ie. maps all visually identical strings to the same + # string. For example, ["e", "◌́"] is mapped to ["é"]. + # 2. It maps strings that are roughly equivalent to the same string. + # For example, ["dž"] is mapped to ["d", "ž"], ["①"] to ["1"] and ["i⁹"] to + # ["i", "9"]. + text = unicodedata.normalize("NFKC", text) + + # Note that nothing is done to make searches accent-insensitive. + # That could be achieved by converting to NFKD form instead (with combining accents + # split out) and filtering out combining accents using `unicodedata.combining(c)`. + # The downside of this may be noisier search results, since search terms with + # explicit accents will match characters with no accents, or completely different + # accents. + # + # text = unicodedata.normalize("NFKD", text) + # text = "".join([c for c in text if not unicodedata.combining(c)]) + + return text + + def _parse_query_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form that can be passed to database. @@ -905,6 +950,7 @@ def _parse_query_sqlite(search_term: str) -> str: We specifically add both a prefix and non prefix matching term so that exact matches get ranked higher. """ + search_term = _filter_text_for_index(search_term) # Pull out the individual words, discarding any non-word characters. results = _parse_words(search_term) @@ -917,6 +963,8 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: We use this so that we can add prefix matching, which isn't something that is supported by default. """ + search_term = _filter_text_for_index(search_term) + escaped_words = [] for word in _parse_words(search_term): # Postgres tsvector and tsquery quoting rules: diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 2d169684cf..43b724c4dd 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -504,6 +504,139 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, ) + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_ascii_case_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name in a + different case. + """ + CHARLIE = "@someuser:example.org" + self.get_success( + self.store.update_profile_in_user_dir(CHARLIE, "Charlie", None) + ) + + r = self.get_success(self.store.search_user_dir(ALICE, "cHARLIE", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": CHARLIE, "display_name": "Charlie", "avatar_url": None}, + ) + + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_unicode_case_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name in a + different case. + """ + IVAN = "@someuser:example.org" + self.get_success(self.store.update_profile_in_user_dir(IVAN, "Иван", None)) + + r = self.get_success(self.store.search_user_dir(ALICE, "иВАН", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": IVAN, "display_name": "Иван", "avatar_url": None}, + ) + + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name in a + different case, when their name contains dotted or dotless "i"s. + + Some languages have dotted and dotless versions of "i", which are considered to + be different letters: i <-> İ, ı <-> I. To make things difficult, they reuse the + ASCII "i" and "I" code points, despite having different lowercase / uppercase + forms. + """ + USER = "@someuser:example.org" + + expected_matches = [ + # (search_term, display_name) + # A search for "i" should match "İ". + ("iiiii", "İİİİİ"), + # A search for "I" should match "ı". + ("IIIII", "ııııı"), + # A search for "ı" should match "I". + ("ııııı", "IIIII"), + # A search for "İ" should match "i". + ("İİİİİ", "iiiii"), + ] + + for search_term, display_name in expected_matches: + self.get_success( + self.store.update_profile_in_user_dir(USER, display_name, None) + ) + + r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10)) + self.assertFalse(r["limited"]) + self.assertEqual( + 1, + len(r["results"]), + f"searching for {search_term!r} did not match {display_name!r}", + ) + self.assertDictEqual( + r["results"][0], + {"user_id": USER, "display_name": display_name, "avatar_url": None}, + ) + + # We don't test for negative matches, to allow implementations that consider all + # the i variants to be the same. + + test_search_user_dir_dotted_dotless_i_case_insensitivity.skip = "not supported" # type: ignore + + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_unicode_normalization(self) -> None: + """Tests that a user can look up another user by searching for their name with + either composed or decomposed accents. + """ + AMELIE = "@someuser:example.org" + + expected_matches = [ + # (search_term, display_name) + ("Ame\u0301lie", "Amélie"), + ("Amélie", "Ame\u0301lie"), + ] + + for search_term, display_name in expected_matches: + self.get_success( + self.store.update_profile_in_user_dir(AMELIE, display_name, None) + ) + + r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10)) + self.assertFalse(r["limited"]) + self.assertEqual( + 1, + len(r["results"]), + f"searching for {search_term!r} did not match {display_name!r}", + ) + self.assertDictEqual( + r["results"][0], + {"user_id": AMELIE, "display_name": display_name, "avatar_url": None}, + ) + + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_accent_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name + without any accents. + """ + AMELIE = "@someuser:example.org" + self.get_success(self.store.update_profile_in_user_dir(AMELIE, "Amélie", None)) + + r = self.get_success(self.store.search_user_dir(ALICE, "amelie", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": AMELIE, "display_name": "Amélie", "avatar_url": None}, + ) + + # It may be desirable for "é"s in search terms to not match plain "e"s and we + # really don't want "é"s in search terms to match "e"s with different accents. + # But we don't test for this to allow implementations that consider all + # "e"-lookalikes to be the same. + + test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore + class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): use_icu = True