New: apply unicode normalization while resolving notes

The unicode standard allows for certain (visually) identical characters to
be represented in different ways.

For example the character ä may be represented as a single combined
codepoint "Latin Small Letter A with Diaeresis" (U+00E4) or by the
combination of "Latin Small Letter A" (U+0061) followed by "Combining
Diaeresis" (U+0308).

When encoded with UTF-8, these are represented as the two bytes
0xC3 0xA4 and the three bytes 0x61 0xCC 0x88, respectively.

A user linking to notes with these characters in their titles would
expect these two variants to link to the same file, given they are
visually identical and have the exact same semantic meaning.

The unicode standard defines a method to deconstruct and normalize these
forms, so that a byte comparison on the normalized forms of these
variants ends up comparing the same thing. This is called Unicode
Normalization, defined in Unicode® Standard Annex #15
(http://www.unicode.org/reports/tr15/).

The W3C Working Group has written an excellent explanation of the
problems regarding string matching, and how unicode normalization helps
with this process: https://www.w3.org/TR/charmod-norm/#unicodeNormalization

With this change, obsidian-export will perform Unicode normalization
(specifically normalization form C, also known as NFC) on all note titles
while looking up link references, ensuring visually identical links
resolve to the same note, even if they were encoded as different
variants.

A special thanks to Hans Raaf (@oderwat) for reporting and helping track
down this issue.

---

Closes #126
This commit is contained in:
Nick Groenen 2022-11-19 16:34:17 +01:00
parent c5ba5b7aef
commit b5b2ea2c3b
No known key found for this signature in database
GPG Key ID: 4F0AD019928AE098
3 changed files with 281 additions and 11 deletions

182
Cargo.lock generated
View File

@ -140,6 +140,101 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38390104763dc37a5145a53c29c63c1290b5d316d6086ec32c293f6736051bb0"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52ba265a92256105f45b719605a571ffe2d1f0fea3807304b522c1d778f79eed"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04909a7a7e4633ae6c4a9ab280aeb86da1236243a77b694a49eacd659a4bd3ac"
[[package]]
name = "futures-executor"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7acc85df6714c176ab5edf386123fafe217be88c0840ec11f199441134a074e2"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00f5fb52a06bdcadeb54e8d3671f8888a39697dcb0b81b23b55174030427f4eb"
[[package]]
name = "futures-macro"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bdfb8ce053d86b91919aad980c220b1fb8401a9394410e1c289ed7e66b61835d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39c15cf1a4aa79df40f1bb462fb39676d0ad9e366c2a33b590d7c66f4f81fcf9"
[[package]]
name = "futures-task"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ffb393ac5d9a6eaa9d3fdf37ae2776656b706e200c8e16b1bdb227f5198e6ea"
[[package]]
name = "futures-timer"
version = "3.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "197676987abd2f9cadff84926f410af1c183608d36641465df73ae8211dc65d6"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
name = "getopts"
version = "0.2.21"
@ -324,10 +419,12 @@ dependencies = [
"pulldown-cmark-to-cmark",
"rayon",
"regex",
"rstest",
"serde_yaml",
"slug",
"snafu",
"tempfile",
"unicode-normalization",
"walkdir",
]
@ -358,6 +455,18 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
[[package]]
name = "pin-project-lite"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pretty_assertions"
version = "1.3.0"
@ -468,6 +577,40 @@ dependencies = [
"winapi",
]
[[package]]
name = "rstest"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9c9dc66cc29792b663ffb5269be669f1613664e69ad56441fdb895c2347b930"
dependencies = [
"futures",
"futures-timer",
"rstest_macros",
"rustc_version",
]
[[package]]
name = "rstest_macros"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5015e68a0685a95ade3eee617ff7101ab6a3fc689203101ca16ebc16f2b89c66"
dependencies = [
"cfg-if",
"proc-macro2",
"quote",
"rustc_version",
"syn",
]
[[package]]
name = "rustc_version"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
dependencies = [
"semver",
]
[[package]]
name = "ryu"
version = "1.0.11"
@ -489,6 +632,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "semver"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4"
[[package]]
name = "serde"
version = "1.0.147"
@ -508,6 +657,15 @@ dependencies = [
"unsafe-libyaml",
]
[[package]]
name = "slab"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
dependencies = [
"autocfg",
]
[[package]]
name = "slug"
version = "0.1.4"
@ -573,6 +731,21 @@ dependencies = [
"once_cell",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "unicase"
version = "2.6.0"
@ -582,6 +755,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "unicode-normalization"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-width"
version = "0.1.10"

View File

@ -39,8 +39,10 @@ regex = "1.7.0"
serde_yaml = "0.9.14"
slug = "0.1.4"
snafu = "0.7.3"
unicode-normalization = "0.1.22"
[dev-dependencies]
pretty_assertions = "1.3.0"
rstest = "0.15.0"
tempfile = "3.3.0"
walkdir = "2.3.2"

View File

@ -30,6 +30,7 @@ use std::io::prelude::*;
use std::io::ErrorKind;
use std::path::{Path, PathBuf};
use std::str;
use unicode_normalization::UnicodeNormalization;
/// A series of markdown [Event]s that are generated while traversing an Obsidian markdown note.
pub type MarkdownEvents<'a> = Vec<Event<'a>>;
@ -706,22 +707,33 @@ impl<'a> Exporter<'a> {
}
}
/// Get the full path for the given filename when it's contained in vault_contents, taking into
/// account:
///
/// 1. Standard Obsidian note references not including a .md extension.
/// 2. Case-insensitive matching
/// 3. Unicode normalization rules using normalization form C
/// (https://www.w3.org/TR/charmod-norm/#unicodeNormalization)
fn lookup_filename_in_vault<'a>(
filename: &str,
vault_contents: &'a [PathBuf],
) -> Option<&'a PathBuf> {
// Markdown files don't have their .md extension added by Obsidian, but other files (images,
// PDFs, etc) do so we match on both possibilities.
//
// References can also refer to notes in a different case (to lowercase text in a
// sentence even if the note is capitalized for example) so we also try a case-insensitive
// lookup.
let filename = PathBuf::from(filename);
let filename_normalized = filename.to_string_lossy().nfc().collect::<String>();
vault_contents.iter().find(|path| {
let path_lowered = PathBuf::from(path.to_string_lossy().to_lowercase());
path.ends_with(filename)
|| path_lowered.ends_with(&filename.to_lowercase())
|| path.ends_with(format!("{}.md", &filename))
|| path_lowered.ends_with(format!("{}.md", &filename.to_lowercase()))
let path_normalized_str = path.to_string_lossy().nfc().collect::<String>();
let path_normalized = PathBuf::from(&path_normalized_str);
let path_normalized_lowered = PathBuf::from(&path_normalized_str.to_lowercase());
// It would be convenient if we could just do `filename.set_extension("md")` at the start
// of this funtion so we don't need multiple separate + ".md" match cases here, however
// that would break with a reference of `[[Note.1]]` linking to `[[Note.1.md]]`.
path_normalized.ends_with(&filename_normalized)
|| path_normalized.ends_with(filename_normalized.clone() + ".md")
|| path_normalized_lowered.ends_with(&filename_normalized.to_lowercase())
|| path_normalized_lowered.ends_with(filename_normalized.to_lowercase() + ".md")
})
}
@ -876,3 +888,77 @@ fn codeblock_kind_to_owned<'a>(codeblock_kind: CodeBlockKind) -> CodeBlockKind<'
CodeBlockKind::Fenced(cowstr) => CodeBlockKind::Fenced(CowStr::from(cowstr.into_string())),
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use rstest::rstest;

    lazy_static! {
        // Minimal stand-in for the list of files found in a vault; shared by all lookup cases.
        static ref VAULT: Vec<std::path::PathBuf> = vec![
            PathBuf::from("NoteA.md"),
            PathBuf::from("Document.pdf"),
            PathBuf::from("Note.1.md"),
            PathBuf::from("nested/NoteA.md"),
            PathBuf::from("Note\u{E4}.md"), // Noteä.md, see also encodings() below
        ];
    }

    /// Documents (and verifies) the two encodings of "ä" that the lookup cases below rely on.
    #[test]
    fn encodings() {
        // Standard "Latin Small Letter A with Diaeresis" (U+00E4)
        // Encoded in UTF-8 as two bytes: 0xC3 0xA4
        assert_eq!(String::from_utf8(vec![0xC3, 0xA4]).unwrap(), "ä");
        assert_eq!("\u{E4}", "ä");

        // Basic (ASCII) lowercase a followed by Unicode Character “◌̈” (U+0308)
        // Renders the same visual appearance but is encoded in UTF-8 as three bytes:
        // 0x61 0xCC 0x88
        //
        // The decomposed form is spelled with explicit escapes rather than a literal character:
        // it is visually indistinguishable from the precomposed form and easily lost or
        // silently re-normalized by editors and other tooling.
        let decomposed = "a\u{308}";
        assert_eq!(String::from_utf8(vec![0x61, 0xCC, 0x88]).unwrap(), decomposed);
        assert_eq!("\u{61}\u{308}", decomposed);

        // Without normalization the two forms are different strings.
        assert_ne!("\u{E4}", decomposed);

        // For more examples and a better explanation of this concept, see
        // https://www.w3.org/TR/charmod-norm/#aringExample
    }

    /// Each case is a link reference and the vault entry it is expected to resolve to.
    #[rstest]
    // Exact match
    #[case("NoteA.md", "NoteA.md")]
    #[case("NoteA", "NoteA.md")]
    // Same note in subdir, exact match should find it
    #[case("nested/NoteA.md", "nested/NoteA.md")]
    #[case("nested/NoteA", "nested/NoteA.md")]
    // Different extensions
    #[case("Document.pdf", "Document.pdf")]
    #[case("Note.1", "Note.1.md")]
    #[case("Note.1.md", "Note.1.md")]
    // Case-insensitive matches
    #[case("notea.md", "NoteA.md")]
    #[case("notea", "NoteA.md")]
    #[case("NESTED/notea.md", "nested/NoteA.md")]
    #[case("NESTED/notea", "nested/NoteA.md")]
    // "Latin Small Letter A with Diaeresis" (U+00E4)
    #[case("Note\u{E4}.md", "Note\u{E4}.md")]
    #[case("Note\u{E4}", "Note\u{E4}.md")]
    // Basic (ASCII) lowercase a followed by Unicode Character “◌̈” (U+0308)
    // The UTF-8 encoding is different but it renders the same visual appearance as the case above,
    // so we expect it to find the same file.
    #[case("Note\u{61}\u{308}.md", "Note\u{E4}.md")]
    #[case("Note\u{61}\u{308}", "Note\u{E4}.md")]
    // We should expect this to work with lowercasing as well, so NoteÄ should find Noteä
    // NoteÄ where Ä = Single Ä (U+00C4)
    #[case("Note\u{C4}.md", "Note\u{E4}.md")]
    #[case("Note\u{C4}", "Note\u{E4}.md")]
    // NoteÄ where Ä = decomposed to A (U+0041)+ ◌̈ (U+0308)
    #[case("Note\u{41}\u{308}.md", "Note\u{E4}.md")]
    #[case("Note\u{41}\u{308}", "Note\u{E4}.md")]
    fn test_lookup_filename_in_vault(#[case] input: &str, #[case] expected: &str) {
        let result = lookup_filename_in_vault(input, &VAULT);

        // Printed output is only shown for failing cases and makes the mismatch easier to read.
        println!("Test input: {:?}", input);
        println!("Expecting: {:?}", expected);
        println!("Got: {:?}", result.unwrap_or(&PathBuf::from("")));

        assert_eq!(result, Some(&PathBuf::from(expected)))
    }
}