Hi Jürgen,

This kind of problem only seems to show up with index entries that start
> with numbers. Does anyone know what causes this, and if so how to prevent
> it?


The problem is that make4ht mistakes these index entries for page numbers.
It tries to detect page numbers which should be turned into links to index
destinations, but sometimes it fails. In this case, it thinks your
subentries start with page numbers, so it turns them to links. I hope the
attached version of the library helps.

Best regards,
Michal
local M = {}
local log = logging.new "indexing"

-- Handle accented characters in files created with \usepackage[utf]{inputenc}
-- this code was originally part of https://github.com/michal-h21/iec2utf/
local enc = {}

local licrs = {}
local codepoint2utf = unicode.utf8.char 
local used_encodings = {}

-- load inputenc encoding file
local function load_encfiles(f)
	local file= io.open(f,"r")
	local encodings = file:read("*all")
	file:close()
	for codepoint, licr in encodings:gmatch('DeclareUnicodeCharacter(%b{})(%b{})') do
		local codepoint = codepoint2utf(tonumber(codepoint:sub(2,-2),16))
		local licr= licr:sub(2,-2):gsub('@tabacckludge','')
		licrs[licr] = codepoint
	end
end

local function sanitize_licr(l)
	return l:gsub(" (.)",function(s) if s:match("[%a]") then return " "..s else return s end end):sub(2,-2)
end

local load_enc = function(enc)
  -- use default encodings if used doesn't provide one
  enc = enc or  {"T1","T2A","T2B","T2C","T3","T5", "LGR"}
	for _,e in pairs(enc) do
		local filename = e:lower() .. "enc.dfu"
    -- don't process an enc file multiple times
    if not used_encodings[filename] then
      local dfufile = kpse.find_file(filename)
      if dfufile then
        load_encfiles(dfufile)
      end
    end
    used_encodings[filename] = true
	end
end



local cache = {}

local get_utf8 = function(input)
	local output = input:gsub('\\IeC[%s]*(%b{})',function(iec)
    -- remove \protect commands 
    local iec = iec:gsub("\\protect%s*", "")
		local code = cache[iec] or licrs[sanitize_licr(iec)] or '\\IeC '..iec
		-- print(iec, code)
		cache[iec] = code
		return code
	end)
	return output
end


-- parse the idx file produced by tex4ht
-- it replaces the document page numbers by index entry number
-- each index entry can then link to place in the HTML file where the
-- \index command had been used

local parse_idx = function(content)
  -- index entry number
  local current_entry = 0
  -- map between index entry number and corresponding HTML file and destination
  local map = {}
  local buffer = {}

  for line in content:gmatch("([^\n]+)") do
    if line:match("^\\beforeentry") then
      -- increment index entry number
      current_entry = current_entry + 1
      local file, dest, locator = line:match("\\beforeentry%s*{(.-)}{(.-)}{(.-)}")
      -- if the third argument to \beforeentry is not empty, 
      -- use it as a index entry locator instead of the index counter
      if locator and locator == "" then locator = nil end
      map[current_entry] = {file = file, dest = dest, locator = locator}
    elseif line:match("^\\indexentry") then
      -- replace the page number with the current
      -- index entry number
      local result = line:gsub("%b{}$", "{"..current_entry .."}")
      buffer[#buffer+1] = get_utf8(result)
    else
      buffer[#buffer+1] = line
    end
  end
  -- return table with page to dest map and updated idx file
  return {map = map, idx = table.concat(buffer, "\n")}
end


local previous
-- replace numbers in .ind file with links back to text
local function replace_index_pages(rest, entries)
  -- keep track of the previous page number
  local count = 0
  local delete_coma = false
  return rest:gsub("(%s*%-*%s*)(,?%s*)(%{?)(%[?)(%d+)(%]?)(%}?)", function(dash, coma, lbrace, lbracket, page, rbracket, rbrace)
    if lbracket == "[" and rbracket == "]" then
      -- don't process numbers in brackets, they are not page numbers
      return nil
    end
    local entry = entries[tonumber(page)]
    count = count + 1
    if entry then
      page = entry.locator or page
      if delete_coma then
        -- if the coma was marked for deletion, remove it. this may happen after line breaks in the index
        coma = ""
      end
      -- if the page number is the same as the previous one, don't create a link
      -- this can happen when we use section numbers as locators. for example, 
      -- we could get 1.1 -- 1.1, 1.1, so we want to keep only the first one
      if page == previous then
        previous = page
        -- if the first page number on a line is the same as the previous one, we need to delete the coma,
        -- otherwise the coma will be left in the output
        if count == 1 then
          delete_coma = true
        end
        return ""
      else
        previous = page
        -- don't forget to reset the delete_coma flag after page change
        delete_coma = false
        -- construct link to the index entry
        return dash .. coma.. lbrace ..  "\\Link[" .. entry.file .."]{".. entry.dest .."}{}" ..  page .."\\EndLink{}" .. rbrace
      end
    else
      return dash .. coma .. lbrace .. lbracket .. page .. rbracket .. rbrace
    end
 end)
end

local function fix_subitems(start, rest)
  -- in xindex, subentries start with a comma, so if the subentry itself is number, it would be mistaken for the page number
  -- the start should contain just \subitem -\
  if start:match("%s*\\subitem %-\\$") then
    -- the keyword in this case is the first item in the rest
    local keyword, newrest = rest:match("(,?[^,]+,)(.+)")
    if keyword and newrest then
      -- join the extracted keyword with the start, newrest should contain only actual page numbers
      return start .. keyword, newrest
    end
  end
  return start, rest
end

-- replace page numbers in the ind file with hyperlinks
local fix_idx_pages = function(content, idxobj)
  local buffer = {}
  local entries = idxobj.map
  for  line in content:gmatch("([^\n]+)")  do
    local line, count = line:gsub("(%s*\\%a+[^%[^,]+)(.+)$", function(start,rest)
      -- reset the previous page number
      previous = nil
      start, rest = fix_subitems(start, rest)
      -- there is a problem when index term itself contains numbers, like Bible verses (1:2),
      -- because they will be detected as page numbers too. I cannot find a good solution 
      -- that wouldn't break something else.
      -- There can be also commands with numbers in braces. These numbers in braces will be ignored, 
      -- as they may be not page numbers
      return start .. replace_index_pages(rest, entries)    end)
    -- longer index entries may be broken over several lines, in that case, we need to process only numbers
    if count == 0 then
      line = line:gsub("(%s*%d+.+)", function(rest)
        return replace_index_pages(rest, entries)
      end)
    end
    buffer[#buffer+1] = line
  end
  return table.concat(buffer, "\n")
end

-- prepare the .idx file produced by tex4ht
-- for use with Xindy or Makeindex
local prepare_idx = function(filename)
  local f = io.open(filename, "r")
  if not f then return nil, "Cannot open file :".. tostring(filename) end
  local content = f:read("*all")
  local idx = parse_idx(content)
  local idxname = os.tmpname()
  local f = io.open(idxname, "w")
  f:write(idx.idx)
  f:close()
  -- return the object with mapping between dummy page numbers 
  -- and link destinations in the files, and the temporary .idx file
  -- these can be used for the processing with the index processor
  return idx, idxname
end

-- add links to a index file
local process_index = function(indname, idx)
  local f = io.open(indname,  "r")
  if not f then return  nil, "Cannot open .ind file: " .. tostring(indname) end
  local content = f:read("*all")
  f:close()

  local newcontent = fix_idx_pages(content, idx)
  local f = io.open(indname,"w")
  f:write(newcontent)
  f:close()
  return true
end

local get_idxname = function(par)
  return par.idxfile or par.input .. ".idx"
end

local prepare_tmp_idx = function(par)
  par.idxfile = mkutils.file_in_builddir(get_idxname(par), par)
  if not par.idxfile or not mkutils.file_exists(par.idxfile) then return nil, "Cannot load idx file " .. (par.idxfile or "''") end
  -- construct the .ind name, based on the .idx name
  par.indfile = par.indfile or par.idxfile:gsub("idx$", "ind")
  load_enc()
  -- save hyperlinks and clean the .idx file
  local idxdata, newidxfile = prepare_idx(par.idxfile)
  if not idxdata then
    -- if the prepare_idx function returns nil, the second reuturned value contains error msg
    return nil, newidxfile
  end
  return  newidxfile, idxdata
end


local splitindex = function(par)
  local files = {}
  local idxfiles = {}
  local buffer 
  local idxfile = get_idxname(par)
  if not idxfile or not mkutils.file_exists(idxfile) then return nil, "Cannot load idx file " .. (idxfile or "''") end
  for line in io.lines(idxfile) do
    local file = line:match("indexentry%[(.-)%]")
    if file then
      -- generate idx name for the current output file
      file =  par.input .. "-" ..file .. ".idx"
      local current = files[file] or {}
      -- remove file name from the index entry
      local indexentry = line:gsub("indexentry%[.-%]", "indexentry")
      -- save the index entry and preseding line to the current buffer
      table.insert(current, buffer)
      table.insert(current, indexentry)
      files[file] = current
    end
    -- 
    buffer = line
  end
  -- save idx files
  for filename, contents in pairs(files) do
    log:info("Saving split index file: " .. filename)
    idxfiles[#idxfiles+1] = filename
    local f = io.open(filename, "w")
    f:write(table.concat(contents, "\n"))
    f:close()
  end
  return idxfiles
end

local function run_indexing_command (command, par)
  -- detect command name from the command. It will be the first word
  local cmd_name = command:match("^[%a]+") or "indexing"
  local xindylog  = logging.new(cmd_name)
  -- support split index
  local subindexes = splitindex(par) or {}
  if #subindexes > 0 then
    -- call the command again on all files produced by splitindex
    for _, subindex in ipairs(subindexes) do
      -- make copy of the parameters
      local t = {}
      for k,v in pairs(par) do t[k] = v end
      t.idxfile = subindex
      run_indexing_command(command, t)
    end
    return nil
  end
  local newidxfile, idxdata = prepare_tmp_idx(par)
  if not newidxfile then
    -- the idxdata will contain error message in the case of error
    xindylog:warning(idxdata)
    return false
  end
  par.newidxfile = newidxfile
  xindylog:debug("Prepared temporary idx file: ", newidxfile)
  -- prepare modules
  local xindy_call = command % par
  xindylog:info(xindy_call)
  local status = mkutils.execute(xindy_call)
  -- insert correct links to the index
  local status, msg = process_index(par.indfile, idxdata)
  if not status then xindylog:warning(msg) end
  -- remove the temporary idx file
  os.remove(newidxfile)
  -- null the indfile, it is necessary in order to support
  -- multiple indices
  par.indfile = nil
end


M.get_utf8 = get_utf8
M.load_enc = load_enc
M.parse_idx = parse_idx
M.fix_idx_pages = fix_idx_pages
M.prepare_idx = prepare_idx
M.process_index = process_index
M.prepare_tmp_idx = prepare_tmp_idx
M.run_indexing_command = run_indexing_command
return M

Reply via email to