This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new 3cccafb1 Allow for invalid chars in metadata
3cccafb1 is described below
commit 3cccafb1b96e64fa2a33b45b1eaaa84c190141b0
Author: Sebb <[email protected]>
AuthorDate: Sat Sep 2 01:06:03 2023 +0100
Allow for invalid chars in metadata
---
www/secretary/iclaparser.rb | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/www/secretary/iclaparser.rb b/www/secretary/iclaparser.rb
index 6e18875e..e1c0bacd 100644
--- a/www/secretary/iclaparser.rb
+++ b/www/secretary/iclaparser.rb
@@ -160,7 +160,17 @@ module ICLAParser
begin
reader = PDF::Reader.new(path)
%w(pdf_version info metadata page_count).each do |i|
- metadata[i] = reader.public_send(i)
+ # It looks like some of the values may not be UTF-8
+ # In particular info[:Producer] may have odd characters
+ val = reader.public_send(i)
+ if val.instance_of? String
+ metadata[i] = val.encode("utf-8", "utf-8", :invalid => :replace)
+ elsif val.instance_of? Hash
+ metadata[i] = val.transform_values {|v| v.encode("utf-8", "utf-8", :invalid => :replace)}
+ else
+ metadata[i] = val.class
+ end
+
end
reader.objects.each do |_k, v|
type = v[:Type] rescue nil
@@ -190,7 +200,7 @@ module ICLAParser
val = v[:V].to_s # might be a symbol
# This is a hack; should really find the font def and use that
if val
- debug[key] = v.inspect
+ # debug[key] = v.inspect
val = encode(val)
if val.length > 0
ckey = canon_field_name(key)
@@ -209,7 +219,7 @@ module ICLAParser
key = v[:T]
val = v[:V].to_s # might be a symbol
if val
- debug[key] = v.inspect
+ # debug[key] = v.inspect
if val.length > 0
data[canon_field_name(key)] = val
end
@@ -250,7 +260,7 @@ module ICLAParser
text = receiver.get_text()
# p text
lines = receiver.get_lines() # do we still need these?
- debug[:lines] = lines
+ # debug[:lines] = lines
if text.length > 3
metadata[:dataSource]['Text'] = true
data[:text] = text
@@ -279,7 +289,7 @@ module ICLAParser
rescue StandardError => e
data[:error] = "Error processing #{path} => #{e.inspect}\n#{e.backtrace.join("\n")}"
end
-# data[:debug] = debug
+ data[:debug] = debug
# TODO attempt to classify data[:text] items?
data
end