| 1 | //===----------------------------------------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | // UNSUPPORTED: no-localization |
| 10 | // UNSUPPORTED: c++03, c++11, c++14 |
| 11 | // UNSUPPORTED: availability-filesystem-missing |
| 12 | // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS |
| 13 | |
| 14 | // <filesystem> |
| 15 | |
| 16 | // class path |
| 17 | |
| 18 | // Test constructors, accessors and modifiers that convert from/to various |
| 19 | // character encodings. Constructors and modifiers (append, concat, |
| 20 | // operator/=, operator+=) accept inputs with various character encodings, |
| 21 | // and accessors (*string(), string<>(), u8string()) export the string with |
| 22 | // various encodings. |
| 23 | // |
| 24 | // Some encodings are standardized; char16_t, char32_t and the u8string |
| 25 | // accessor and u8path constructor (and normal functions taking char8_t in |
| 26 | // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either |
| 27 | // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be |
| 28 | // left unimplemented. |
| 29 | // |
// Plain char is implicitly UTF-8 on posix systems. On Windows, plain char
// is supposed to be in the same encoding that the platform's native file
// system APIs consume in the functions that take narrow strings as path
// names.
| 34 | |
| 35 | #include <filesystem> |
| 36 | #include <type_traits> |
| 37 | #include <cassert> |
| 38 | |
| 39 | #include "test_macros.h" |
| 40 | |
| 41 | #ifdef _WIN32 |
| 42 | # include <windows.h> // SetFileApisToANSI & friends |
| 43 | #endif |
| 44 | namespace fs = std::filesystem; |
| 45 | |
| 46 | // Test conversion with strings that fit within the latin1 charset, that fit |
| 47 | // within one code point in UTF-16, and that can be expressible in certain |
| 48 | // one-byte code pages. |
| 49 | static void test_latin_unicode() |
| 50 | { |
| 51 | const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 }; |
| 52 | const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 }; |
| 53 | const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string |
| 54 | #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) |
| 55 | const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 }; |
| 56 | #else |
| 57 | const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; |
| 58 | #endif |
| 59 | #ifndef TEST_HAS_NO_WIDE_CHARACTERS |
| 60 | const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 }; |
| 61 | #endif |
| 62 | |
| 63 | // Test well-defined conversion between UTF-8, UTF-16 and UTF-32 |
| 64 | { |
| 65 | const fs::path p(u16str); |
| 66 | assert(p.u8string() == u8str); |
| 67 | assert(p.u16string() == u16str); |
| 68 | assert(p.u32string() == u32str); |
| 69 | assert(p.string<char16_t>() == u16str); |
| 70 | assert(p.string<char32_t>() == u32str); |
| 71 | } |
| 72 | { |
| 73 | const fs::path p(u32str); |
| 74 | assert(p.u8string() == u8str); |
| 75 | assert(p.u16string() == u16str); |
| 76 | assert(p.u32string() == u32str); |
| 77 | assert(p.string<char16_t>() == u16str); |
| 78 | assert(p.string<char32_t>() == u32str); |
| 79 | } |
| 80 | { |
| 81 | const fs::path p = fs::u8path(str); |
| 82 | assert(p.u8string() == u8str); |
| 83 | assert(p.u16string() == u16str); |
| 84 | assert(p.u32string() == u32str); |
| 85 | assert(p.string<char16_t>() == u16str); |
| 86 | assert(p.string<char32_t>() == u32str); |
| 87 | } |
| 88 | #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) |
| 89 | { |
| 90 | // In C++20, the path constructor can unambiguously handle UTF-8 input, |
| 91 | // even if the plain char constructor would treat it as something else. |
| 92 | const fs::path p(u8str); |
| 93 | assert(p.u8string() == u8str); |
| 94 | assert(p.u16string() == u16str); |
| 95 | assert(p.u32string() == u32str); |
| 96 | assert(p.string<char8_t>() == u8str); |
| 97 | assert(p.string<char16_t>() == u16str); |
| 98 | assert(p.string<char32_t>() == u32str); |
| 99 | } |
| 100 | // Check reading various inputs with string<char8_t>() |
| 101 | { |
| 102 | const fs::path p(u16str); |
| 103 | assert(p.string<char8_t>() == u8str); |
| 104 | } |
| 105 | { |
| 106 | const fs::path p(u32str); |
| 107 | assert(p.string<char8_t>() == u8str); |
| 108 | } |
| 109 | { |
| 110 | const fs::path p = fs::u8path(str); |
| 111 | assert(p.string<char8_t>() == u8str); |
| 112 | } |
| 113 | #endif |
| 114 | #ifndef TEST_HAS_NO_WIDE_CHARACTERS |
| 115 | // Test conversion to/from wchar_t. |
| 116 | { |
| 117 | const fs::path p(u16str); |
| 118 | assert(p.wstring() == wstr); |
| 119 | assert(p.string<wchar_t>() == wstr); |
| 120 | } |
| 121 | { |
| 122 | const fs::path p = fs::u8path(str); |
| 123 | assert(p.wstring() == wstr); |
| 124 | assert(p.string<wchar_t>() == wstr); |
| 125 | } |
| 126 | { |
| 127 | const fs::path p(wstr); |
| 128 | assert(p.wstring() == wstr); |
| 129 | assert(p.u8string() == u8str); |
| 130 | assert(p.u16string() == u16str); |
| 131 | assert(p.u32string() == u32str); |
| 132 | assert(p.string<wchar_t>() == wstr); |
| 133 | } |
| 134 | #endif // TEST_HAS_NO_WIDE_CHARACTERS |
| 135 | #ifndef _WIN32 |
| 136 | // Test conversion to/from regular char-based string. On POSIX, this |
| 137 | // is implied to convert to/from UTF-8. |
| 138 | { |
| 139 | const fs::path p(str); |
| 140 | assert(p.string() == str); |
| 141 | assert(p.u16string() == u16str); |
| 142 | assert(p.string<char>() == str); |
| 143 | } |
| 144 | { |
| 145 | const fs::path p(u16str); |
| 146 | assert(p.string() == str); |
| 147 | assert(p.string<char>() == str); |
| 148 | } |
| 149 | #else |
| 150 | // On windows, the narrow char-based input/output is supposed to be |
| 151 | // in the charset that narrow file IO APIs use. This can either be the |
| 152 | // current active code page (ACP) or the OEM code page, exposed by |
| 153 | // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and |
| 154 | // SetFileApisToOEM(). We can't set which codepage is active within |
| 155 | // the process, but for some specific known ones, we can check if they |
| 156 | // behave as expected. |
| 157 | SetFileApisToANSI(); |
| 158 | if (GetACP() == 1252) { |
| 159 | const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 }; |
| 160 | { |
| 161 | const fs::path p(wstr); |
| 162 | assert(p.string() == latin1); |
| 163 | assert(p.string<char>() == latin1); |
| 164 | } |
| 165 | { |
| 166 | const fs::path p(latin1); |
| 167 | assert(p.string() == latin1); |
| 168 | assert(p.wstring() == wstr); |
| 169 | assert(p.u8string() == u8str); |
| 170 | assert(p.u16string() == u16str); |
| 171 | assert(p.string<char>() == latin1); |
| 172 | assert(p.string<wchar_t>() == wstr); |
| 173 | } |
| 174 | } |
| 175 | SetFileApisToOEM(); |
| 176 | if (GetOEMCP() == 850 || GetOEMCP() == 437) { |
| 177 | // These chars are identical in both CP 850 and 437 |
| 178 | const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 }; |
| 179 | { |
| 180 | const fs::path p(wstr); |
| 181 | assert(p.string() == cp850); |
| 182 | assert(p.string<char>() == cp850); |
| 183 | } |
| 184 | { |
| 185 | const fs::path p(cp850); |
| 186 | assert(p.string() == cp850); |
| 187 | assert(p.wstring() == wstr); |
| 188 | assert(p.u8string() == u8str); |
| 189 | assert(p.u16string() == u16str); |
| 190 | assert(p.string<char>() == cp850); |
| 191 | assert(p.string<wchar_t>() == wstr); |
| 192 | } |
| 193 | } |
| 194 | #endif |
| 195 | } |
| 196 | |
| 197 | // Test conversion with strings that don't fit within one UTF-16 code point. |
| 198 | // Here, wchar_t can be either UTF-16 or UTF-32 depending on the size on the |
| 199 | // particular platform. |
| 200 | static void test_wide_unicode() |
| 201 | { |
| 202 | const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; |
| 203 | const char32_t u32str[] = { 0x10437, 0x00 }; |
| 204 | #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t) |
| 205 | const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 }; |
| 206 | #else |
| 207 | const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
| 208 | #endif |
| 209 | const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
| 210 | { |
| 211 | const fs::path p = fs::u8path(source: str); |
| 212 | assert(p.u8string() == u8str); |
| 213 | assert(p.u16string() == u16str); |
| 214 | assert(p.u32string() == u32str); |
| 215 | } |
| 216 | { |
| 217 | const fs::path p(u16str); |
| 218 | assert(p.u8string() == u8str); |
| 219 | assert(p.u16string() == u16str); |
| 220 | assert(p.u32string() == u32str); |
| 221 | } |
| 222 | { |
| 223 | const fs::path p(u32str); |
| 224 | assert(p.u8string() == u8str); |
| 225 | assert(p.u16string() == u16str); |
| 226 | assert(p.u32string() == u32str); |
| 227 | } |
| 228 | #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
| 229 | # if __SIZEOF_WCHAR_T__ == 2 |
| 230 | const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; |
| 231 | # else |
| 232 | const wchar_t wstr[] = { 0x10437, 0x00 }; |
| 233 | # endif |
| 234 | // Test conversion to/from wchar_t. |
| 235 | { |
| 236 | const fs::path p = fs::u8path(source: str); |
| 237 | assert(p.wstring() == wstr); |
| 238 | } |
| 239 | { |
| 240 | const fs::path p(u16str); |
| 241 | assert(p.wstring() == wstr); |
| 242 | } |
| 243 | { |
| 244 | const fs::path p(u32str); |
| 245 | assert(p.wstring() == wstr); |
| 246 | } |
| 247 | { |
| 248 | const fs::path p(wstr); |
| 249 | assert(p.u8string() == u8str); |
| 250 | assert(p.u16string() == u16str); |
| 251 | assert(p.u32string() == u32str); |
| 252 | assert(p.wstring() == wstr); |
| 253 | } |
| 254 | #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
| 255 | } |
| 256 | |
| 257 | // Test appending paths in different encodings. |
| 258 | static void test_append() |
| 259 | { |
| 260 | const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; |
| 261 | const char32_t u32str[] = { 0x10437, 0x00 }; |
| 262 | const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 }; |
| 263 | const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
| 264 | { |
| 265 | fs::path p = fs::u8path(source: str) / u16str / u32str; |
| 266 | assert(p.u32string() == u32ref); |
| 267 | p = fs::u8path(source: str).append(source: u16str).append(source: u32str); |
| 268 | assert(p.u32string() == u32ref); |
| 269 | p = fs::u8path(source: str); |
| 270 | p /= u16str; |
| 271 | p /= u32str; |
| 272 | assert(p.u32string() == u32ref); |
| 273 | } |
| 274 | #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
| 275 | # if __SIZEOF_WCHAR_T__ == 2 |
| 276 | const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; |
| 277 | # else |
| 278 | const wchar_t wstr[] = { 0x10437, 0x00 }; |
| 279 | # endif |
| 280 | // Test conversion from wchar_t. |
| 281 | { |
| 282 | fs::path p = fs::path(u16str) / wstr / u32str; |
| 283 | assert(p.u32string() == u32ref); |
| 284 | p = fs::path(u16str).append(source: wstr).append(source: u32str); |
| 285 | assert(p.u32string() == u32ref); |
| 286 | p = fs::path(u16str); |
| 287 | p /= wstr; |
| 288 | p /= u32str; |
| 289 | assert(p.u32string() == u32ref); |
| 290 | } |
| 291 | #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
| 292 | } |
| 293 | |
| 294 | static void test_concat() |
| 295 | { |
| 296 | const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 }; |
| 297 | const char32_t u32str[] = { 0x10437, 0x00 }; |
| 298 | const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 }; |
| 299 | const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 }; |
| 300 | { |
| 301 | fs::path p = fs::u8path(source: str); |
| 302 | p += u16str; |
| 303 | p += u32str; |
| 304 | assert(p.u32string() == u32ref); |
| 305 | p = fs::u8path(source: str).concat(x: u16str).concat(x: u32str); |
| 306 | assert(p.u32string() == u32ref); |
| 307 | } |
| 308 | #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
| 309 | # if __SIZEOF_WCHAR_T__ == 2 |
| 310 | const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 }; |
| 311 | # else |
| 312 | const wchar_t wstr[] = { 0x10437, 0x00 }; |
| 313 | # endif |
| 314 | // Test conversion from wchar_t. |
| 315 | { |
| 316 | fs::path p = fs::path(u16str); |
| 317 | p += wstr; |
| 318 | p += u32str; |
| 319 | assert(p.u32string() == u32ref); |
| 320 | p = fs::path(u16str).concat(x: wstr).concat(x: u32str); |
| 321 | assert(p.u32string() == u32ref); |
| 322 | } |
| 323 | #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__) |
| 324 | } |
| 325 | |
// Test appending and concatenating narrow strings onto a path.
//
// The base path is U+00E5 given as UTF-16. The same character is then added
// as a narrow string (char8_t in C++20, plain char otherwise), and the result
// is compared as UTF-32 against the expected append and concat outcomes.
static void test_append_concat_narrow()
{
  const char16_t u16str[] = { 0xe5, 0x00 };
  const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
  const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };

#if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
  {
    const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
    // In C++20, appending a char8_t string is unambiguously treated as
    // UTF-8.
    fs::path p = fs::path(u16str) / u8str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(u8str);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= u8str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(u8str);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += u8str;
    assert(p.u32string() == u32ref_concat);
  }
#endif
#ifndef _WIN32
  // Test appending a regular char-based string. On POSIX, this
  // is implied to convert to/from UTF-8.
  {
    const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
    fs::path p = fs::path(u16str) / str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(str);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= str;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(str);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += str;
    assert(p.u32string() == u32ref_concat);
  }
#else
  // The ANSI/OEM selection is process-wide state; remember it so that it can
  // be put back once both modes have been exercised.
  const bool file_apis_were_ansi = AreFileApisANSI() != 0;

  SetFileApisToANSI();
  if (GetACP() == 1252) {
    const char latin1[] = { char(0xe5), 0x00 };
    fs::path p = fs::path(u16str) / latin1;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(latin1);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= latin1;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(latin1);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += latin1;
    assert(p.u32string() == u32ref_concat);
  }
  SetFileApisToOEM();
  if (GetOEMCP() == 850 || GetOEMCP() == 437) {
    // This char is identical in both CP 850 and 437
    const char cp850[] = { char(0x86), 0x00 };
    fs::path p = fs::path(u16str) / cp850;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).append(cp850);
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str);
    p /= cp850;
    assert(p.u32string() == u32ref_append);
    p = fs::path(u16str).concat(cp850);
    assert(p.u32string() == u32ref_concat);
    p = fs::path(u16str);
    p += cp850;
    assert(p.u32string() == u32ref_concat);
  }

  // Put the file API mode back the way it was found.
  if (file_apis_were_ansi)
    SetFileApisToANSI();
  else
    SetFileApisToOEM();
#endif
}
| 405 | |
| 406 | int main(int, char**) |
| 407 | { |
| 408 | test_latin_unicode(); |
| 409 | test_wide_unicode(); |
| 410 | test_append(); |
| 411 | test_concat(); |
| 412 | test_append_concat_narrow(); |
| 413 | |
| 414 | return 0; |
| 415 | } |
| 416 | |