This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new c63df4a9 Create lib and tool to repair UTF8 files
c63df4a9 is described below
commit c63df4a9bedb2d4b6ce7a7e48463d81e1c0250a3
Author: Sebb <[email protected]>
AuthorDate: Fri Aug 11 22:51:18 2023 +0100
Create lib and tool to repair UTF8 files
---
lib/whimsy/utf8-utils.rb | 29 +++++++++++++++++++++++++++++
tools/utf8-fix.rb | 13 +++++++++++++
2 files changed, 42 insertions(+)
diff --git a/lib/whimsy/utf8-utils.rb b/lib/whimsy/utf8-utils.rb
new file mode 100755
index 00000000..990d6974
--- /dev/null
+++ b/lib/whimsy/utf8-utils.rb
@@ -0,0 +1,29 @@
+#!/usr/bin/env ruby
+
+# Utility module for working with UTF8
+# Initially only contains a method to repair UTF8 files
+
+module UTF8Utils
+ UTF8_REPLACE = '�'
+
+ #
+ # Initially assumes the file is in utf8-softbank encoding
+ # If that does not work, then it tries ISO-8859-1
+ def self.repair(src, dst)
+ opts = {undef: :replace, invalid: :replace}
+ ec1 = Encoding::Converter.new('utf8-softbank', "UTF-8", **opts)
+ ec2 = Encoding::Converter.new('iso-8859-1', "UTF-8", **opts)
+
+ open(dst,'w:utf-8') do |w|
+ open(src,'rb').each do |l|
+ o = ec1.convert(l) # initial conversion try
+ unless o == l
+ if o.include? UTF8_REPLACE # something did not convert
+ o = ec2.convert(l) # try another encoding
+ end
+ end
+ w.write o
+ end
+ end
+ end
+end
diff --git a/tools/utf8-fix.rb b/tools/utf8-fix.rb
new file mode 100755
index 00000000..44cb455a
--- /dev/null
+++ b/tools/utf8-fix.rb
@@ -0,0 +1,13 @@
+#!/usr/bin/env ruby
+
+# @(#) fix non-UTF8 source files
+
+$LOAD_PATH.unshift '/srv/whimsy/lib'
+require 'whimsy/utf8-utils'
+
+if __FILE__ == $0
+ src = ARGV.shift or raise Exception.new "need input file"
+ dst = ARGV.shift || src + '.tmp'
+ puts "Input: #{src} output: #{dst}"
+ UTF8Utils::repair(src, dst)
+end