From 86153fbba9cebaf4ecc95fa43a0853ef65c737e6 Mon Sep 17 00:00:00 2001 From: Nicholas Marriott Date: Fri, 1 Aug 2025 09:45:18 +0100 Subject: [PATCH] Add UTF-8 regress test. --- regress/UTF-8-test.txt | Bin 0 -> 22781 bytes regress/utf8-test.result | 301 +++++++++++++++++++++++++++++++++++++++ regress/utf8-test.sh | 21 +++ 3 files changed, 322 insertions(+) create mode 100644 regress/UTF-8-test.txt create mode 100644 regress/utf8-test.result create mode 100644 regress/utf8-test.sh diff --git a/regress/UTF-8-test.txt b/regress/UTF-8-test.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b GIT binary patch literal 22781 zcmWFyanrR>NJ&l3Pf0CONKPzBOv=p3EU8pT%u7)yE-6YaE>EiU2GB@%FP`6d=+ zmli8{muBQC*kqKH6j(}L`XB(O7+bQTO7#SFv>Ka(+ z8d)glDmXhUI7KR$=oxTvg=Azx>{dw2%t=*9PRvutNX;owsLU@_NUccB&CE;HQOL+I z2MH7@pn4=DF)t-2wV10cu_!aYv{<2}vLLlsAwNwaIlrhVHMvAbAvZB6Ex#x?HAP1u zzepj!BqOz`JhM1eAv3QewJ5c?Br`7^VkTE{YGG+=UUF)&o&wlOVAo_8E9B*uDCDLl z=9MUvtqKspMkWvO`zB^mjp=@}sX3MHu(5bG6+OViU+i%S#|xeAI>(^88PlX6nwk)D*A zkyw_QUs|M)SfWszpPQ;skdp}Zu4hTHLSkmFLT0g!LS|lZNorz>4#*|&Xei0Z%*$3t zOv*1U(E+CzP*MRqUIAfhej2tI%Fk0sTF!EGjP1Ey>JH zRY)u?$;dAP$2dyDC{{?z%u4}zG$%E&xI`g8FO{n}6_Q2@it>|kQganDL4lZ>Sp*ND z(qd3z0Eadx?)5-P6B5A+IhlE>#R^HOIr-%Z$@yieMG8szrFkidMU|k$kdg`VPcc$r zLr+pV3W>!E<*7M2AR3xBb1D^5^FaAA1)^F3o&Z516PlL^l2ZT?Tm|`=c_qaP;2iHTwb7d>-uyc+2lC+jFADx{>AWhSR`6{RMpW|o1pLqfMWBQXaQStY4O1x2YP z3W*9z@cadLKFB#G8L0}ziMgquG!2Ql%o0eJ1L-RU<)MtkqQvBq)FOzlGK)dkqc}4q z6`Z6KQ}PQ+Qd1O)OOuLIOO!xv<5Eh7X@Q1BVo@qMbCzd<(kB*u3J~`ufpU9FW^!Ul zY6@3LK3HLCacYs0LQL`?FWF}`QsfnO$U6u*Tb2*g?DVgB%uQanbLm?5j zv(t+5b3ri%DN=JPx!}nbmPTO_1CPtB(&7?u+$5GIX6As4AxL&pNG>fZO3f?bO3TkH zQAjE+K`N5dN^^256>?J36LU}tIi!ROE@t!8OSp0q^HLH^K&d6OSWm$-4;=EziN&cp z3Si4K)ARC+Kq(h9ffg&|7jc#5A)E$}hGMYsV24A?cu<)Sw_Zn~JToUpAvZBQl`ADR zsWd$uY(aieszOR;S{kUBhnA0^N-QTeF$JU@l+-|_L|SQaW`3R?7gqqNOiEQq&PYwp z2E}(Os1m>z0hz@L8itynG**;aoSFy97>OlZi8(ns3K~YJ;z^aDIy1Q>rxH}=DS%3} zoK%obP*IPb=JJcUQVUBn%Mx=yiLp2{Jufjw6H>ry7;A#_Nm_nRPJTJan}wzMC5fPN z4xH)~64O9=H!+VZGY1@F=xIH(7*+>E5?Vn{VkM{&&eTiQQz$6PFGwxoDoQP`%uCKF z%FoNJf*4(rn4MY-sh1Qob8}NuG80Qub1LDsA!ROjeGRHlGeHG@4yeqjELKno@o-ge z^>a~zl|(t9Dgac3>ws#NOmID=kdhBdM#ZV1B$Qa91F5^9xmqE)D6u#LRQ}}Wrh=Lt zNzm#Vk~yJ8aAHn*VkM|n;7U#`ElyQP%P#`Ob8>1CD3%>_a-h`)IOW4!2r9HvD-yvm zWNwKl0g4rJ^7GOaGzxM+&6tc-uwJgT)YKGBNb*jB)G|5>pduLLnPP?1yc7jc;isTf zqokvdT9KSuPy%)ZsQ4`@&j&Y7xFBkc^bGY3LFK4|k)DyBAt>E}%9-T+ypqJsJcYdc zJl%q#%)AnCUUdw1_Vfh#rKBi7ru{B1q!y=wT#D>xct}FRN};p>6lcl#IiJ57Jgl1SQF0P=tZ97pN(sP*|FooSjn%@)aaERf5_%C?z$xMF6TfQn{cN zutHvGZc=IysJ2Ba0W$MI4XmO{a0GzbIEf{oGQKx5ZpbaHV8^IM5S4JwR6$z?bixm=c zit`l`6$(;|(o(_cD=`_ABET`K0cz)G=0Vzh#hE3g;QUq$@@gJZV^LF~I6tQpoXa7V z8l=Ec0Qp)WGcP5xEHeevUQ4PhNd<))Qmp~ckFX?wwW5Qk97w?eNjSwNMTsS;>6HqZ zd0gO#gJwfe=%?l>6hO*LaET7etbSaskRq`vzi^Gg+=84*$(rYg7>rKV;pl;;H+~5*U0aWc5<8})+85+1x&rm_Z4b(MJ zD9A4^&IEP6;4Q!WG*AO7wFq3+=A`DOmt3Z_N!HU=ecNTKaw?&hXotB{(aV40?nl%`;-p!9Y=(SZl{ z2-wG<1P(FL71X}BRY*-$urL52TLq}?AJkUPJSP$Phh z1Ur^$L=Bg*f`Tt>)EYjjMq>L0;T-CxMMz4_ORXqKO$LocfclV`d5|6jv<^d9Pmo1@ z_ql=w>Pqpr57ydEN>wOMEl31)K5{A*Ambl7pwT|i$Q#VzHCTFkpsr75NwGqHd7d6j z9kDd^{c8;F+C$r(7?Gi1P+?(UrJzJ?Bd3N7WFD-t2M+?EM6QBCMN*oTf)cUi4Q>2u z3~G5|__Y}1PhuSgjWCFRO+XD#9Oe-lau5R{=9z+;ow&>+$%PQ}%s_2UoaT|@LWp_h zp!Ozi^GI+Z#5@ZH1#`@_0*Q}?{SEtxcOh-ki=~1>Fl-ne)EUdkQ7|(>bks0ILO}zR zZ*)P49npXU#W3~Vr=Xyq)X><}+|t_C-qG3B-P7CG53-3o3~}FtiIXNznL2IyjG41$ z&zU=KK6z$=bV1BpuyE1hB}NRWEtp~}Fhau)|*tlu)maW^i@7TF(_ny7` zNJww!wnEH97bTw$O3$FuFFy}7@LXJyScEiBLq;u36Ze7If}jCrVLM!ZU^E3NI91D!fv7t?)+St-?En_X8du#tI6c_^?2a z4ihZVkp_y6G;{3P4KegFsPo$gg^voK6h13_QTVFxP2oEU;g6oYX_G%pAn{>>9v`OI z;=@AMpdu}4NXN$yg`Wz)6n;|>53qQlO?;R_;sc{PGQ$=h$)Na18Pf6bN8v9O;sO*W zw1^L5L~FMgG93q=0>^0OD&%GsgO&(@+<^;I-+iEh6x2v526dys>spfYa|?1(OF-Rm z$QnH49y)kg0=!!ea&As4l4G&;&vg_)%M8GC`Y@9~Gh5*GJfNjAdFg12B2dhOPX>eL zqQRwfej0c&L>_3NouL7E(y3TaLBkog?xh&CtP3=|3RzF3V5EsKkow_gtOp)}L2KzD z#WN^y;ZdsrnzDyaB`6(0Z`pv_8^)ma23pI4Nb??G^z0x88Y?KE)r~|N_yKd=5n`f= zf&yCUPNa!HuuPmlj5JkHKr5h#H1ZF&$ry;4W(w$SaNK4>rr;oR3QG5}$Bemx0?{!8 zHV-sTs%fR5^d6&A1#zGS2{EH0E3fcTiv zMVe1g0#77hnu*Ad)L?$~Ed+#xRSkS`2WYtyXf+sou#~7iAbnm${UKZkT0#j*fQU75iO9>`LB(E9Y6)n0YJM8Rtu>$(hzhV}^+tLI z3dJR$HN-_JdI}!-<)G#C;Pqye3Lx`AfdXEeo|>nSmztbfTnt+44_ZVAGqHvXea$^& zS#%L-wIIlO;AM)b;3dEXMfoYEpv@u~skwRzjtWY}iD{|O6|)HQpsU`LU`^Jd)U4Fx z5>U%C7qX@qybv7WYea~ER-S_LC(JAmUBgwBnpm8lS8N4dJnUHl-c?f!T3QHRy9`>> zo&#DGpP8RmtdNx0@`?oB$*WOT2lqkVqXOfA2&^faTxAGn#=lTm_j0sUSo26x<*?2#QmaON&68 z&cORwK)t`R%+zun1!(By=N2TEWJ33)<)>lf4@7zgZO4MFcZEhJ)C73=gY&Q!D0;L& z>!tI{A^8y5)1dqTO(T%uZipF>Dh}$QqQt!PR0V?y1JH<`d77R=NWMa9UU6v=XsKx? zJSl^MPoah@u^2QV4)PYnC{SAv?jaq}dR<6@LC*cqa4ZJx@=7cL8w4dl?gQt4c)ofD;s`x=yXgEC%hUf|^E)bqXe++Sj!r5!81F=di>)L?HqV z0$7Z|oq>}<{rV$3vjnuS1vAw`*3d&c^^h44P>`j8R$jwH8{MIx_<*JtL{SH-YtVgK zoRe6bp`f9pucQfz7SLv`B+wQNP-Ld2z>NjvU(miDq~vcAy( zx(3q#x&{-j8dOFQw-^&*BJzq!kP|^=1Z;gH#86NfLF6h&h>@@*lc146u#pxD7Um$2 zCxH!yl#D;xiCo1CF%z<85;}GVS~HofU}20f6S4~RPcvv4FEJ%9EutH=f*01Gh36A+ zT@7lJfij9;sE-0XMMDA?2T6VRq4s9MnG2S&i(z>Syn_o*zCt**2D%LyVFaje3LVFT zb{t@RHE`wxc^}eJ0R;lWASkPb3mz`SJNq`J2-K&S#`Wz5+Kgghz>Vj?IzL$*uWDuA*w*p+ZYL0K8g zYy_%72p+^pPQ7Yu@nL(Iaz`OaN zxg}H^5!s30$Oetrff=9>UC{8BaKwYsB6vq1!b~?<(#4Jc=$geN$uqquV&@!<){H&8PX z?u45Ocjr3d!xNHoK;;FBJHbYwxN|-6;b{TRjZiZY?u45OW`NStdg8(pUv4E`4wR0J z75wt^;B9d5u}@^Td8l7rptg3D@{pWbtOOoxR|3rf6_ur?fCm-8r>o^9=70vh!N)A2 z1WFB8VnG3X?g?_>49XNLrtv`2 zIA8}PXMo1%CdJ29Zvr95kb5e`JGf1GL!yp5~$)E$K6u^r=K;y`u_J@T+N@7VO)P*%%$%%Oi zMX9--(Nl!MMY105@qS*(zo zm;yd-18yGVFcnb4xEOLq7HHXANlqo`1U1BXd~qrG#47NyBH*aNu~>up=|w>oa^wa0 zNEpWePv|N{Lo;2_fmZNj4)PiJge}k{C-{7@^wc6y6Sp)~LDvxKTj&q=+KnZycFI1G+nSg8m|6snxJ#@Qj3ZpNAW<8%)w}s(!{@>(8Et#WX!h?q%D7`?=#mh-e&n(Hzg&ZN32|7S6IX@RQhe8S;$V@OSKA;nw2m?W5ov^bL zL90ALhth#F9BAqqG-aBZmyQzP2(4II)Q^t<_$iY)l^{of4mboKw}=uQps<8yLF57$ zqg0^2`%GbjH*Riju83{n;FUY?_wOTS#1JJU!8InN_X8T_M3@L(yF`|Zo#Z(sg?`-=D5SH9oA>izcB@3*gczkTid?d#reU#~vhfEv_1y=xPwB(_{rf)5-@pIE`u+Pptlz)?!~XsIKJ4GW z|6~9DeINVx@BcV||Gtm&_wWC>e*eCY>-X>fxPSk?kNfxU|J1*K->3fl`=RFGr4X+F zG=KlTPxJTh|FnMpzEA7-@Bg%a|GrQA_wWDQzklE7{{8zu&)>iA^Zfn$Kd;}v@ALZo p`#)``F%v|z^|}|3jiI7NE-kE literal 0 HcmV?d00001 diff --git a/regress/utf8-test.result b/regress/utf8-test.result new file mode 100644 index 00000000..e700cb17 --- /dev/null +++ b/regress/utf8-test.result @@ -0,0 +1,301 @@ +UTF-8 decoder capability and stress test +---------------------------------------- + +Markus Kuhn - 2015-08-28 - CC BY 4.0 + +This test file can help you examine, how your UTF-8 decoder handles +various types of correct, malformed, or otherwise interesting UTF-8 +sequences. This file is not meant to be a conformance test. It does +not prescribe any particular outcome. Therefore, there is no way to +"pass" or "fail" this test file, even though the text does suggest a +preferable decoder behaviour at some places. Its aim is, instead, to +help you think about, and test, the behaviour of your UTF-8 decoder on a +systematic collection of unusual inputs. Experience so far suggests +that most first-time authors of UTF-8 decoders find at least one +serious problem in their decoder using this file. + +The test lines below cover boundary conditions, malformed UTF-8 +sequences, as well as correctly encoded UTF-8 sequences of Unicode code +points that should never occur in a correct UTF-8 file. + +According to ISO 10646-1:2000, sections D.7 and 2.3c, a device +receiving UTF-8 shall interpret a "malformed sequence in the same way +that it interprets a character that is outside the adopted subset" and +"characters that are not within the adopted subset shall be indicated +to the user" by a receiving device. One commonly used approach in +UTF-8 decoders is to replace any malformed UTF-8 sequence by a +replacement character (U+FFFD), which looks a bit like an inverted +question mark, or a similar symbol. It might be a good idea to +visually distinguish a malformed UTF-8 sequence from a correctly +encoded Unicode character that is just not available in the current +font but otherwise fully legal, even though ISO 10646-1 doesn't +mandate this. In any case, just ignoring malformed sequences or +unavailable characters does not conform to ISO 10646, will make +debugging more difficult, and can lead to user confusion. + +Please check, whether a malformed UTF-8 sequence is (1) represented at +all, (2) represented by exactly one single replacement character (or +equivalent signal), and (3) the following quotation mark after an +illegal UTF-8 sequence is correctly displayed, i.e. proper +resynchronization takes place immediately after any malformed +sequence. This file says "THE END" in the last line, so if you don't +see that, your decoder crashed somehow before, which should always be +cause for concern. + +All lines in this file are exactly 79 characters long (plus the line +feed). In addition, all lines end with "|", except for the two test +lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls +U+0000 and U+007F. If you display this file with a fixed-width font, +these "|" characters should all line up in column 79 (right margin). +This allows you to test quickly, whether your UTF-8 decoder finds the +correct number of characters in every line, that is whether each +malformed sequences is replaced by a single replacement character. + +Note that, as an alternative to the notion of malformed sequence used +here, it is also a perfectly acceptable (and in some situations even +preferable) solution to represent each individual byte of a malformed +sequence with a replacement character. If you follow this strategy in +your decoder, then please ignore the "|" column. + + +Here come the tests: | + | +1 Some correct UTF-8 text | + | +You should see the Greek word 'kosme': "κόσμε" | + | +2 Boundary condition test cases | + | +2.1 First possible sequence of a certain length | + | +2.1.1 1 byte (U-00000000): "" +2.1.2 2 bytes (U-00000080): "€" | +2.1.3 3 bytes (U-00000800): "ࠀ" | +2.1.4 4 bytes (U-00010000): "𐀀" | +2.1.5 5 bytes (U-00200000): "�����" | +2.1.6 6 bytes (U-04000000): "������" | + | +2.2 Last possible sequence of a certain length | + | +2.2.1 1 byte (U-0000007F): "" +2.2.2 2 bytes (U-000007FF): "߿" | +2.2.3 3 bytes (U-0000FFFF): "￿" | +2.2.4 4 bytes (U-001FFFFF): "����" | +2.2.5 5 bytes (U-03FFFFFF): "�����" | +2.2.6 6 bytes (U-7FFFFFFF): "������" | + | +2.3 Other boundary conditions | + | +2.3.1 U-0000D7FF = ed 9f bf = "퟿" | +2.3.2 U-0000E000 = ee 80 80 = "" | +2.3.3 U-0000FFFD = ef bf bd = "�" | +2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" | +2.3.5 U-00110000 = f4 90 80 80 = "�" | + | +3 Malformed sequences | + | +3.1 Unexpected continuation bytes | + | +Each unexpected continuation byte should be separately signalled as a | +malformed sequence of its own. | + | +3.1.1 First continuation byte 0x80: "�" | +3.1.2 Last continuation byte 0xbf: "�" | + | +3.1.3 2 continuation bytes: "��" | +3.1.4 3 continuation bytes: "���" | +3.1.5 4 continuation bytes: "����" | +3.1.6 5 continuation bytes: "�����" | +3.1.7 6 continuation bytes: "������" | +3.1.8 7 continuation bytes: "�������" | + | +3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): | + | + "���������������� | + ���������������� | + ���������������� | + ����������������" | + | +3.2 Lonely start characters | + | +3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), | + each followed by a space character: | + | + "� � � � � � � � � � � � � � � � | + � � � � � � � � � � � � � � � � " | + | +3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), | + each followed by a space character: | + | + "� � � � � � � � � � � � � � � � " | + | +3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), | + each followed by a space character: | + | + "� � � � � � � � " | + | +3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), | + each followed by a space character: | + | + "� � � � " | + | +3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), | + each followed by a space character: | + | + "� � " | + | +3.3 Sequences with last continuation byte missing | + | +All bytes of an incomplete sequence should be signalled as a single | +malformed sequence, i.e., you should see only a single replacement | +character in each of the next 10 tests. (Characters as in section 2) | + | +3.3.1 2-byte sequence with last byte missing (U+0000): "�" | +3.3.2 3-byte sequence with last byte missing (U+0000): "�" | +3.3.3 4-byte sequence with last byte missing (U+0000): "�" | +3.3.4 5-byte sequence with last byte missing (U+0000): "����" | +3.3.5 6-byte sequence with last byte missing (U+0000): "�����" | +3.3.6 2-byte sequence with last byte missing (U-000007FF): "�" | +3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "�" | +3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "���" | +3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "����" | +3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�����" | + | +3.4 Concatenation of incomplete sequences | + | +All the 10 sequences of 3.3 concatenated, you should see 10 malformed | +sequences being signalled: | + | + "���������������������������" | + | +3.5 Impossible bytes | + | +The following two bytes cannot appear in a correct UTF-8 string | + | +3.5.1 fe = "�" | +3.5.2 ff = "�" | +3.5.3 fe fe ff ff = "����" | + | +4 Overlong sequences | + | +The following sequences are not malformed according to the letter of | +the Unicode 2.0 standard. However, they are longer then necessary and | +a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 | +decoder" should reject them just like malformed sequences for two | +reasons: (1) It helps to debug applications if overlong sequences are | +not treated as valid representations of characters, because this helps | +to spot problems more quickly. (2) Overlong sequences provide | +alternative representations of characters, that could maliciously be | +used to bypass filters that check only for ASCII characters. For | +instance, a 2-byte encoded line feed (LF) would not be caught by a | +line counter that counts only 0x0a bytes, but it would still be | +processed as a line feed by an unsafe UTF-8 decoder later in the | +pipeline. From a security point of view, ASCII compatibility of UTF-8 | +sequences means also, that ASCII characters are *only* allowed to be | +represented by ASCII bytes in the range 0x00-0x7f. To ensure this | +aspect of ASCII compatibility, use only "safe UTF-8 decoders" that | +reject overlong UTF-8 sequences for which a shorter encoding exists. | + | +4.1 Examples of an overlong ASCII character | + | +With a safe UTF-8 decoder, all of the following five overlong | +representations of the ASCII character slash ("/") should be rejected | +like a malformed UTF-8 sequence, for instance by substituting it with | +a replacement character. If you see a slash below, you do not have a | +safe UTF-8 decoder! | + | +4.1.1 U+002F = c0 af = "��" | +4.1.2 U+002F = e0 80 af = "�" | +4.1.3 U+002F = f0 80 80 af = "�" | +4.1.4 U+002F = f8 80 80 80 af = "�����" | +4.1.5 U+002F = fc 80 80 80 80 af = "������" | + | +4.2 Maximum overlong sequences | + | +Below you see the highest Unicode value that is still resulting in an | +overlong sequence if represented with the given number of bytes. This | +is a boundary test for safe UTF-8 decoders. All five characters should | +be rejected like malformed UTF-8 sequences. | + | +4.2.1 U-0000007F = c1 bf = "��" | +4.2.2 U-000007FF = e0 9f bf = "�" | +4.2.3 U-0000FFFF = f0 8f bf bf = "�" | +4.2.4 U-001FFFFF = f8 87 bf bf bf = "�����" | +4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "������" | + | +4.3 Overlong representation of the NUL character | + | +The following five sequences should also be rejected like malformed | +UTF-8 sequences and should not be treated like the ASCII NUL | +character. | + | +4.3.1 U+0000 = c0 80 = "��" | +4.3.2 U+0000 = e0 80 80 = "�" | +4.3.3 U+0000 = f0 80 80 80 = "�" | +4.3.4 U+0000 = f8 80 80 80 80 = "�����" | +4.3.5 U+0000 = fc 80 80 80 80 80 = "������" | + | +5 Illegal code positions | + | +The following UTF-8 sequences should be rejected like malformed | +sequences, because they never represent valid ISO 10646 characters and | +a UTF-8 decoder that accepts them might introduce security problems | +comparable to overlong UTF-8 sequences. | + | +5.1 Single UTF-16 surrogates | + | +5.1.1 U+D800 = ed a0 80 = "�" | +5.1.2 U+DB7F = ed ad bf = "�" | +5.1.3 U+DB80 = ed ae 80 = "�" | +5.1.4 U+DBFF = ed af bf = "�" | +5.1.5 U+DC00 = ed b0 80 = "�" | +5.1.6 U+DF80 = ed be 80 = "�" | +5.1.7 U+DFFF = ed bf bf = "�" | + | +5.2 Paired UTF-16 surrogates | + | +5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "��" | +5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "��" | +5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "��" | +5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "��" | +5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "��" | +5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "��" | +5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "��" | +5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "��" | + | +5.3 Noncharacter code positions | + | +The following "noncharacters" are "reserved for internal use" by | +applications, and according to older versions of the Unicode Standard | +"should never be interchanged". Unicode Corrigendum #9 dropped the | +latter restriction. Nevertheless, their presence in incoming UTF-8 data | +can remain a potential security risk, depending on what use is made of | +these codes subsequently. Examples of such internal use: | + | + - Some file APIs with 16-bit characters may use the integer value -1 | + = U+FFFF to signal an end-of-file (EOF) or error condition. | + | + - In some UTF-16 receivers, code point U+FFFE might trigger a | + byte-swap operation (to convert between UTF-16LE and UTF-16BE). | + | +With such internal use of noncharacters, it may be desirable and safer | +to block those code points in UTF-8 decoders, as they should never | +occur legitimately in incoming UTF-8 data, and could trigger unsafe | +behaviour in subsequent processing. | + | +Particularly problematic noncharacters in 16-bit applications: | + | +5.3.1 U+FFFE = ef bf be = "￾" | +5.3.2 U+FFFF = ef bf bf = "￿" | + | +Other noncharacters: | + | +5.3.3 U+FDD0 .. U+FDEF = "﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯"| + | +5.3.4 U+nFFFE U+nFFFF (for n = 1..10) | + | + "🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿 | + 򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿" | + | +THE END | + diff --git a/regress/utf8-test.sh b/regress/utf8-test.sh new file mode 100644 index 00000000..3b2b22c5 --- /dev/null +++ b/regress/utf8-test.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +PATH=/bin:/usr/bin +TERM=screen + +[ -z "$TEST_TMUX" ] && TEST_TMUX=$(readlink -f ../tmux) +TMUX="$TEST_TMUX -Ltest" +TMP=$(mktemp) +trap "rm -f $TMP" 0 1 15 +$TMUX kill-server 2>/dev/null + +$TMUX -f/dev/null \ + set -g remain-on-exit on \; \ + set -g remain-on-exit-format '' \; \ + new -d -- cat UTF-8-test.txt +sleep 1 +$TMUX capturep -pCeJS- >$TMP +$TMUX kill-server + +cmp -s $TMP utf8-test.result || exit 1 +exit 0