This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new c63df4a9 Create lib and tool to repair UTF8 files
c63df4a9 is described below

commit c63df4a9bedb2d4b6ce7a7e48463d81e1c0250a3
Author: Sebb <[email protected]>
AuthorDate: Fri Aug 11 22:51:18 2023 +0100

    Create lib and tool to repair UTF8 files
---
 lib/whimsy/utf8-utils.rb | 29 +++++++++++++++++++++++++++++
 tools/utf8-fix.rb        | 13 +++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/lib/whimsy/utf8-utils.rb b/lib/whimsy/utf8-utils.rb
new file mode 100755
index 00000000..990d6974
--- /dev/null
+++ b/lib/whimsy/utf8-utils.rb
@@ -0,0 +1,29 @@
+#!/usr/bin/env ruby
+
+# Utility module for working with UTF8
+# Initially only contains a method to repair UTF8 files
+
+module UTF8Utils
+  UTF8_REPLACE = '�'
+
+  #
+  # Initially assumes the file is in utf8-softbank encoding
+  # If that does not work, then it tries ISO-8859-1
+  def self.repair(src, dst)
+    opts = {undef: :replace, invalid: :replace}
+    ec1 = Encoding::Converter.new('utf8-softbank', "UTF-8", **opts)
+    ec2 = Encoding::Converter.new('iso-8859-1', "UTF-8", **opts)
+
+    open(dst,'w:utf-8') do |w|
+      open(src,'rb').each do |l|
+        o = ec1.convert(l) # initial conversion try
+        unless o == l
+          if o.include? UTF8_REPLACE # something did not convert
+            o = ec2.convert(l) # try another encoding
+          end
+        end
+        w.write o
+      end
+    end
+  end
+end
diff --git a/tools/utf8-fix.rb b/tools/utf8-fix.rb
new file mode 100755
index 00000000..44cb455a
--- /dev/null
+++ b/tools/utf8-fix.rb
@@ -0,0 +1,13 @@
+#!/usr/bin/env ruby
+
+# @(#) fix non-UTF8 source files
+
+$LOAD_PATH.unshift '/srv/whimsy/lib'
+require 'whimsy/utf8-utils'
+
+if __FILE__ == $0
+  src = ARGV.shift or raise Exception.new "need input file"
+  dst = ARGV.shift || src + '.tmp'
+  puts "Input: #{src} output: #{dst}"
+  UTF8Utils::repair(src, dst)
+end

Reply via email to