Jordan: BaseTools UPT (Source\Python\UPT\Library\UniClassObject.py) also parses UNI files. Could you update it as well? Or do you expect the UPT tool owner to follow up?
Thanks Liming -----Original Message----- From: Jordan Justen [mailto:jordan.l.jus...@intel.com] Sent: Tuesday, May 05, 2015 3:09 PM To: edk2-devel@lists.sourceforge.net Subject: [edk2] [PATCH v2 1/7] BaseTools: Support UTF-8 string data in .uni files Since UEFI only supports UTF-16LE strings internally, this simply allows for another Unicode encoding of the source file. The strings are still converted to UTF-16LE data for use in EDK II source code. When .uni files contain UTF-16 data, it is impossible for unicode code points to be larger than 0xFFFF. To support .uni files that contain UTF-8 data, we also need to deal with the possibility that the UTF-8 file contains unicode code points larger than 16 bits. Since UEFI only supports 16-bit string data, we make UniClassObject generate an error if a larger code point is seen in a UTF-8 string value. We only check string value data, so it is possible to use larger code points in comments. v2: * Drop .utf8 extension. Use .uni file for UTF-8 data (mdkinney) * Merge in 'BaseTools/UniClassObject: Verify string data is 16-bit' commit Cc: Yingke D Liu <yingke.d....@intel.com> Cc: Michael D Kinney <michael.d.kin...@intel.com> Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com> --- BaseTools/Source/Python/AutoGen/UniClassObject.py | 38 +++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/BaseTools/Source/Python/AutoGen/UniClassObject.py b/BaseTools/Source/Python/AutoGen/UniClassObject.py index aa54f4f..41448ab 100644 --- a/BaseTools/Source/Python/AutoGen/UniClassObject.py +++ b/BaseTools/Source/Python/AutoGen/UniClassObject.py @@ -209,7 +209,7 @@ class UniFileClassObject(object): Lang = distutils.util.split_quoted((Line.split(u"//")[0])) if len(Lang) != 3: try: - FileIn = codecs.open(LongFilePath(File.Path), mode='rb', encoding='utf-16').read() + FileIn = self.OpenUniFile(LongFilePath(File.Path)) except UnicodeError, X: 
EdkLogger.error("build", FILE_READ_FAILURE, "File read failure: %s" % str(X), ExtraData=File); except: @@ -253,6 +253,38 @@ class UniFileClassObject(object): self.OrderedStringDict[LangName][Item.StringName] = len(self.OrderedStringList[LangName]) - 1 return True + def OpenUniFile(self, FileName): + Encoding = 'utf-8' + UniFile = open(FileName, 'rb') + + # + # Seek to end of file to determine its size + # + UniFile.seek(0, 2) + FileSize = UniFile.tell() + + if FileSize >= 2: + # + # Seek to start of the file to read the UTF-16 BOM + # + UniFile.seek(0, 0) + Bom = UniFile.read(2) + UniFile.seek(0, 0) + + if Bom == '\xff\xfe': + Encoding = 'utf-16' + + Info = codecs.lookup(Encoding) + return codecs.StreamReaderWriter(UniFile, Info.streamreader, + Info.streamwriter) + + def Verify16bitCodePoints(self, String): + for cp in String: + if ord(cp) > 0xffff: + tmpl = 'The string {} defined in file {} ' + \ + 'contains a character with a code point above 0xFFFF.' + error = tmpl.format(repr(String), self.File) + EdkLogger.error('Unicode File Parser', FORMAT_INVALID, + error) + # # Get String name and value # @@ -274,6 +306,7 @@ class UniFileClassObject(object): Language = LanguageList[IndexI].split()[0] Value = LanguageList[IndexI][LanguageList[IndexI].find(u'\"') + len(u'\"') : LanguageList[IndexI].rfind(u'\"')] #.replace(u'\r\n', u'') Language = GetLanguageCode(Language, self.IsCompatibleMode, self.File) + self.Verify16bitCodePoints(Value) self.AddStringToList(Name, Language, Value) # @@ -305,7 +338,7 @@ class UniFileClassObject(object): EdkLogger.error("Unicode File Parser", FILE_NOT_FOUND, ExtraData=File.Path) try: - FileIn = codecs.open(LongFilePath(File.Path), mode='rb', encoding='utf-16') + FileIn = self.OpenUniFile(LongFilePath(File.Path)) except UnicodeError, X: EdkLogger.error("build", FILE_READ_FAILURE, "File read failure: %s" % str(X), ExtraData=File.Path); except: @@ -426,6 +459,7 @@ class UniFileClassObject(object): MatchString = re.match('[A-Z0-9_]+', Name, 
re.UNICODE) if MatchString == None or MatchString.end(0) != len(Name): EdkLogger.error('Unicode File Parser', FORMAT_INVALID, 'The string token name %s defined in UNI file %s contains the invalid lower case character.' %(Name, self.File)) + self.Verify16bitCodePoints(Value) self.AddStringToList(Name, Language, Value) continue -- 2.1.4 ------------------------------------------------------------------------------ One dashboard for servers and applications across Physical-Virtual-Cloud Widest out-of-the-box monitoring support with 50+ applications Performance metrics, stats and reports that give you Actionable Insights Deep dive visibility with transaction tracing using APM Insight. http://ad.doubleclick.net/ddm/clk/290420510;117567292;y _______________________________________________ edk2-devel mailing list edk2-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/edk2-devel ------------------------------------------------------------------------------ One dashboard for servers and applications across Physical-Virtual-Cloud Widest out-of-the-box monitoring support with 50+ applications Performance metrics, stats and reports that give you Actionable Insights Deep dive visibility with transaction tracing using APM Insight. http://ad.doubleclick.net/ddm/clk/290420510;117567292;y _______________________________________________ edk2-devel mailing list edk2-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/edk2-devel