On 06/04/15 08:42, Jordan Justen wrote:
> Supplementary Plane characters can exist in UTF-16 files,
> but they are not valid UCS-2 characters.
> 
> For example, this python interpreter code:
> >>> import codecs
> >>> codecs.encode(u'\U00010300', 'utf-16')
> '\xff\xfe\x00\xd8\x00\xdf'
> 
> Therefore the UCS-4 0x00010300 character is encoded as two
> 16-bit numbers (0xd800 0xdf00) in a little endian UTF-16
> file.
> 
> For more information, see:
> http://en.wikipedia.org/wiki/UTF-16#U.2B10000_to_U.2B10FFFF
> 
> This test checks to make sure that BaseTools will reject these
> characters in UTF-16 files.
> 
> Code points in the range 0xd800 - 0xdfff should also be rejected,
> because they are reserved for surrogate pair usage in UTF-16
> files.
> 
> This test was fixed by the previous commit:
> "BaseTools/UniClassObject: Verify valid UCS-2 chars in UTF-16 .uni files"
> 
> Contributed-under: TianoCore Contribution Agreement 1.0
> Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com>
> Cc: Yingke D Liu <yingke.d....@intel.com>
> Cc: Michael D Kinney <michael.d.kin...@intel.com>
> Cc: Laszlo Ersek <ler...@redhat.com>
> ---
>  BaseTools/Tests/CheckUnicodeSourceFiles.py | 35 +++++++++++++++++++++++++++++-
>  1 file changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/BaseTools/Tests/CheckUnicodeSourceFiles.py b/BaseTools/Tests/CheckUnicodeSourceFiles.py
> index 0083ad8..ad5fd18 100644
> --- a/BaseTools/Tests/CheckUnicodeSourceFiles.py
> +++ b/BaseTools/Tests/CheckUnicodeSourceFiles.py
> @@ -38,7 +38,10 @@ class Tests(TestTools.BaseToolsTest):
>      def EncodeToFile(self, encoding, string=None):
>          if string is None:
>              string = self.SampleData
> -        data = codecs.encode(string, encoding)
> +        if encoding is not None:
> +            data = codecs.encode(string, encoding)
> +        else:
> +            data = string
>          path = 'input.uni'
>          self.WriteTmpFile(path, data)
>          return PathClass(self.GetTmpFilePath(path))
> @@ -81,6 +84,36 @@ class Tests(TestTools.BaseToolsTest):
>      def testUtf16InUniFile(self):
>          self.CheckFile('utf_16', shouldPass=True)
>  
> +    def testSupplementaryPlaneUnicodeCharInUtf16File(self):
> +        #
> +        # Supplementary Plane characters can exist in UTF-16 files,
> +        # but they are not valid UCS-2 characters.
> +        #
> +        # This test makes sure that BaseTools rejects these characters
> +        # if seen in a .uni file.
> +        #
> +        data = u'''
> +            #langdef en-US "English"
> +            #string STR_A #language en-US "CodePoint (\U00010300) > 0xFFFF"
> +        '''
> +
> +        self.CheckFile('utf_16', shouldPass=False, string=data)
> +
> +    def testSurrogatePairUnicodeCharInUtf16File(self):
> +        #
> +        # Surrogate Pair code points are used in UTF-16 files to
> +        # encode the Supplementary Plane characters. But, a Surrogate
> +        # Pair code point which is not followed by another Surrogate
> +        # Pair code point might be interpreted as a single code point
> +        # with the Surrogate Pair code point.
> +        #
> +        # This test makes sure that BaseTools rejects these characters
> +        # if seen in a .uni file.
> +        #
> +        data = codecs.BOM_UTF16_LE + '//\x01\xd8 '
> +
> +        self.CheckFile(encoding=None, shouldPass=False, string=data)
> +
>  TheTestSuite = TestTools.MakeTheTestSuite(locals())
>  
>  if __name__ == '__main__':
> 

Reviewed-by: Laszlo Ersek <ler...@redhat.com>

------------------------------------------------------------------------------
_______________________________________________
edk2-devel mailing list
edk2-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/edk2-devel

Reply via email to