diff --git a/Cargo.lock b/Cargo.lock index 24e147a..b0753c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -140,6 +140,101 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "futures" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38390104763dc37a5145a53c29c63c1290b5d316d6086ec32c293f6736051bb0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ba265a92256105f45b719605a571ffe2d1f0fea3807304b522c1d778f79eed" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04909a7a7e4633ae6c4a9ab280aeb86da1236243a77b694a49eacd659a4bd3ac" + +[[package]] +name = "futures-executor" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7acc85df6714c176ab5edf386123fafe217be88c0840ec11f199441134a074e2" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00f5fb52a06bdcadeb54e8d3671f8888a39697dcb0b81b23b55174030427f4eb" + +[[package]] +name = "futures-macro" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdfb8ce053d86b91919aad980c220b1fb8401a9394410e1c289ed7e66b61835d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"39c15cf1a4aa79df40f1bb462fb39676d0ad9e366c2a33b590d7c66f4f81fcf9" + +[[package]] +name = "futures-task" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffb393ac5d9a6eaa9d3fdf37ae2776656b706e200c8e16b1bdb227f5198e6ea" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + +[[package]] +name = "futures-util" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "197676987abd2f9cadff84926f410af1c183608d36641465df73ae8211dc65d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + [[package]] name = "getopts" version = "0.2.21" @@ -324,10 +419,12 @@ dependencies = [ "pulldown-cmark-to-cmark", "rayon", "regex", + "rstest", "serde_yaml", "slug", "snafu", "tempfile", + "unicode-normalization", "walkdir", ] @@ -358,6 +455,18 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +[[package]] +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pretty_assertions" version = "1.3.0" @@ -468,6 +577,40 @@ dependencies = [ "winapi", ] +[[package]] +name = "rstest" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9c9dc66cc29792b663ffb5269be669f1613664e69ad56441fdb895c2347b930" +dependencies = 
[ + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5015e68a0685a95ade3eee617ff7101ab6a3fc689203101ca16ebc16f2b89c66" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "rustc_version", + "syn", +] + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "ryu" version = "1.0.11" @@ -489,6 +632,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "semver" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" + [[package]] name = "serde" version = "1.0.147" @@ -508,6 +657,15 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "slab" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] + [[package]] name = "slug" version = "0.1.4" @@ -573,6 +731,21 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + [[package]] name = "unicase" version = "2.6.0" @@ -582,6 +755,15 @@ dependencies = [ 
"version_check", ] +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-width" version = "0.1.10" diff --git a/Cargo.toml b/Cargo.toml index f1a373e..d19d065 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,8 +39,10 @@ regex = "1.7.0" serde_yaml = "0.9.14" slug = "0.1.4" snafu = "0.7.3" +unicode-normalization = "0.1.22" [dev-dependencies] pretty_assertions = "1.3.0" +rstest = "0.15.0" tempfile = "3.3.0" walkdir = "2.3.2" diff --git a/src/lib.rs b/src/lib.rs index b1291e2..ee6510f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,7 @@ use std::io::prelude::*; use std::io::ErrorKind; use std::path::{Path, PathBuf}; use std::str; +use unicode_normalization::UnicodeNormalization; /// A series of markdown [Event]s that are generated while traversing an Obsidian markdown note. pub type MarkdownEvents<'a> = Vec<Event<'a>>; @@ -706,22 +707,33 @@ impl<'a> Exporter<'a> { } } +/// Get the full path for the given filename when it's contained in vault_contents, taking into +/// account: +/// +/// 1. Standard Obsidian note references not including a .md extension. +/// 2. Case-insensitive matching +/// 3. Unicode normalization rules using normalization form C +/// (https://www.w3.org/TR/charmod-norm/#unicodeNormalization) fn lookup_filename_in_vault<'a>( filename: &str, vault_contents: &'a [PathBuf], ) -> Option<&'a PathBuf> { - // Markdown files don't have their .md extension added by Obsidian, but other files (images, - // PDFs, etc) do so we match on both possibilities. - // - // References can also refer to notes in a different case (to lowercase text in a - // sentence even if the note is capitalized for example) so we also try a case-insensitive - // lookup.
+ let filename = PathBuf::from(filename); + let filename_normalized = filename.to_string_lossy().nfc().collect::<String>(); + vault_contents.iter().find(|path| { - let path_lowered = PathBuf::from(path.to_string_lossy().to_lowercase()); - path.ends_with(filename) - || path_lowered.ends_with(&filename.to_lowercase()) - || path.ends_with(format!("{}.md", &filename)) - || path_lowered.ends_with(format!("{}.md", &filename.to_lowercase())) + let path_normalized_str = path.to_string_lossy().nfc().collect::<String>(); + let path_normalized = PathBuf::from(&path_normalized_str); + let path_normalized_lowered = PathBuf::from(&path_normalized_str.to_lowercase()); + + // It would be convenient if we could just do `filename.set_extension("md")` at the start + // of this function so we don't need multiple separate + ".md" match cases here, however + // that would break with a reference of `[[Note.1]]` linking to `[[Note.1.md]]`. + + path_normalized.ends_with(&filename_normalized) + || path_normalized.ends_with(filename_normalized.clone() + ".md") + || path_normalized_lowered.ends_with(&filename_normalized.to_lowercase()) + || path_normalized_lowered.ends_with(filename_normalized.to_lowercase() + ".md") }) } @@ -876,3 +888,77 @@ fn codeblock_kind_to_owned<'a>(codeblock_kind: CodeBlockKind) -> CodeBlockKind<' CodeBlockKind::Fenced(cowstr) => CodeBlockKind::Fenced(CowStr::from(cowstr.into_string())), } } + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + use rstest::rstest; + + lazy_static!
{ + static ref VAULT: Vec = vec![ + PathBuf::from("NoteA.md"), + PathBuf::from("Document.pdf"), + PathBuf::from("Note.1.md"), + PathBuf::from("nested/NoteA.md"), + PathBuf::from("Note\u{E4}.md"), // Noteä.md, see also encodings() below + ]; + } + + #[test] + fn encodings() { + // Standard "Latin Small Letter A with Diaeresis" (U+00E4) + // Encoded in UTF-8 as two bytes: 0xC3 0xA4 + assert_eq!(String::from_utf8(vec![0xC3, 0xA4]).unwrap(), "ä"); + assert_eq!("\u{E4}", "ä"); + + // Basic (ASCII) lowercase a followed by Unicode Character “◌̈” (U+0308) + // Renders the same visual appearance but is encoded in UTF-8 as three bytes: + // 0x61 0xCC 0x88 + assert_eq!(String::from_utf8(vec![0x61, 0xCC, 0x88]).unwrap(), "ä"); + assert_eq!("a\u{308}", "ä"); + assert_eq!("\u{61}\u{308}", "ä"); + + // For more examples and a better explanation of this concept, see + // https://www.w3.org/TR/charmod-norm/#aringExample + } + + #[rstest] + // Exact match + #[case("NoteA.md", "NoteA.md")] + #[case("NoteA", "NoteA.md")] + // Same note in subdir, exact match should find it + #[case("nested/NoteA.md", "nested/NoteA.md")] + #[case("nested/NoteA", "nested/NoteA.md")] + // Different extensions + #[case("Document.pdf", "Document.pdf")] + #[case("Note.1", "Note.1.md")] + #[case("Note.1.md", "Note.1.md")] + // Case-insensitive matches + #[case("notea.md", "NoteA.md")] + #[case("notea", "NoteA.md")] + #[case("NESTED/notea.md", "nested/NoteA.md")] + #[case("NESTED/notea", "nested/NoteA.md")] + // "Latin Small Letter A with Diaeresis" (U+00E4) + #[case("Note\u{E4}.md", "Note\u{E4}.md")] + #[case("Note\u{E4}", "Note\u{E4}.md")] + // Basic (ASCII) lowercase a followed by Unicode Character “◌̈” (U+0308) + // The UTF-8 encoding is different but it renders the same visual appearance as the case above, + // so we expect it to find the same file. 
+ #[case("Note\u{61}\u{308}.md", "Note\u{E4}.md")] + #[case("Note\u{61}\u{308}", "Note\u{E4}.md")] + // We should expect this to work with lowercasing as well, so NoteÄ should find Noteä + // NoteÄ where Ä = Single Ä (U+00C4) + #[case("Note\u{C4}.md", "Note\u{E4}.md")] + #[case("Note\u{C4}", "Note\u{E4}.md")] + // NoteÄ where Ä = decomposed to A (U+0041) + ◌̈ (U+0308) + #[case("Note\u{41}\u{308}.md", "Note\u{E4}.md")] + #[case("Note\u{41}\u{308}", "Note\u{E4}.md")] + fn test_lookup_filename_in_vault(#[case] input: &str, #[case] expected: &str) { + let result = lookup_filename_in_vault(input, &VAULT); + println!("Test input: {:?}", input); + println!("Expecting: {:?}", expected); + println!("Got: {:?}", result.unwrap_or(&PathBuf::from(""))); + assert_eq!(result, Some(&PathBuf::from(expected))) + } +}