[tex-hyphen] hyphenation for Kazakh

r.ermers--- via tex-hyphen Sun, 08 Jun 2025 01:02:55 -0700

Dear sirs,

As a turcologist doing research on Kazakh, I am trying to set up hyphenation 
patterns for Kazakh in order to use them in LaTeX.


As you may know, Kazakh is an agglutinating Turkic language, related to - among 
others - Turkish, Turkmen, Kyrgyz and Tatar. Unlike Turkmen and Turkish, 
Kazakh is (still) written in the Cyrillic script.

With the help of generate-patterns-kk.rb I generated a file hyph-kk.tex, with 
which I would like to conduct my first experiments.

One question is whether it is handy to divide vowels into back and front, like 
the developer of the Turkmen patterns has done, or not. A second question is 
how to implement exceptions, e.g. loan-words from Russian. Thirdly, I need 
advice on how to implement the patterns in a TeX Live installation.

It goes without saying that I will make the patterns and other files available 
for the benefit of future LaTeX users.

Can you advise me how to proceed further?

Kind regards,

Robert Ermers
the Netherlands

#!/usr/bin/env ruby
#
# This script generates hyphenation patterns for Kazakh (adapted from the Turkmen generator)
#
# This script has been written by Mojca Miklavec <mojca dot miklavec dot lists at gmail dot com>
#[email protected]

# Open the output file for the generated patterns; the handle is kept in the
# global $tr, which add_comment and the generation code below write to.
# Presumably the Turkmen original, kept for reference:
# $tr = File.new("hyph-tk.tex", "w")
# Active target: hyph-kk.tex, given as a relative path.
# The commented-out alternative below is the location in TDS.
$tr = File.new("hyph-kk.tex", "w")
# $tr = File.new("../../../../../tex/generic/hyph-utf8/patterns/tex/hyph-kk.tex", "w")

# write comments into the file
# Writes +str+ to the output file ($tr) as a TeX comment.
#
# Every line of +str+ is prefixed with "% "; a blank line inside the text
# becomes a bare "%" so that it carries no trailing space.
def add_comment(str)
	prefixed = str.gsub(/\n/, "\n% ")
	# collapse "% " followed directly by a newline (i.e. an empty line) to "%"
	trimmed = prefixed.gsub(/% \n/, "%\n")
	$tr.puts "% #{trimmed}"
end

# define a class of vowels and consonants
# vowels could be split into back and front classes (see the commented-out lists below)
# so that unnecessary permutations are not generated; for now a single list is used
vowels = %w{Ð° Ð¾ Ò± Ñ Ñ Ñ Ñ Ðµ Ó Ñ Ò¯ Ó© Ñ}
# back_vowels = %w{Ñ Ð° Ð¾ Ò± Ñ}
# front_vowels = %w{Ðµ Ó Ñ Ò¯ Ó© Ñ}
# front_vowels = %w{Ã¤ e i Ã¶ Ã¼}
# back_vowels = %w{a y o u}
consonants = %w{Ð± Ð² Ð³ Ò Ð´ Ð¶ Ð· Ò» Ðº Ò Ð¿ Ñ Ñ Ñ Ñ Ñ Ñ Ñ Ñ Ð¹ Ð» Ð¼ Ð½ Ò£ Ñ Ñ}
# This is to eliminate impossible combinations
common_suffix_consonants = %w{Ñ Ð´ Ñ Ñ Ñ Ðº Ò Ð³ Ò Ð» Ð± Ð¼ Ð¿ Ñ}
# common_suffix_consonants = %w{Ñ Ã§ d g j k l m n p s t Ã½ z Å}


# In Kazakh there are: 7 sonorants: Ð¹, Ð», Ð¼, Ð½, Ò£, Ñ, Ñ 
# 8 voiced consonants: Ð±, Ð²,Ð³, Ò, Ð´, Ð¶, Ð·, Ò» 
# 11 unvoiced consonants: Ðº, Ò, Ð¿, Ñ, Ñ, Ñ, Ñ, Ñ, Ñ, Ñ, Ñ

# Ð Ð°	Ó Ó	Ð Ð±	Ð Ð²	Ð Ð³	Ò Ò	Ð Ð´
# Ð Ðµ	Ð Ñ	Ð Ð¶	Ð Ð·	Ð Ð¸	Ð Ð¹	Ð Ðº
# Ò Ò	Ð Ð»	Ð Ð¼	Ð Ð½	Ò¢ Ò£	Ð Ð¾	Ó¨ Ó©
# Ð Ð¿	Ð  Ñ	Ð¡ Ñ	Ð¢ Ñ	Ð£ Ñ	Ò° Ò±	Ò® Ò¯
# Ð¤ Ñ	Ð¥ Ñ	Òº Ò»	Ð¦ Ñ	Ð§ Ñ	Ð¨ Ñ	Ð© Ñ
# Ðª Ñ	Ð« Ñ	Ð Ñ	Ð¬ Ñ	Ð Ñ	Ð® Ñ	Ð¯ Ñ


# start the file
add_comment(
"Hyphenation patterns for Kazakh (hyph-kk.tex)

Author:  Robert E <>
License: Public domain
Version: 0.1
Date:    7 june 2025

----------------------------------------------------------------------

The file has been auto-generated from generate_patterns_kk.rb
that is part of hyph-utf8.

For more information about UTF-8 hyphenation patterns for TeX and
links to this file see
    http://www.tug.org/tex-hyphen/
")

# The following remark applies to the Kazakh patterns:
#
# Some of the patterns below represent combinations that never
# happen in Kazakh. Should they occur, they would be hyphenated
# according to the rules.

$tr.puts '\patterns{'
add_comment("Some suffixes are added through a hyphen. When hyphenating these words, a hyphen is added before the hyphen so that the line ends with a hyphen and the new line starts with a hyphen.")
$tr.puts "1-4"

add_comment("Allow hyphen after a vowel if and only if there is a single consonant before next the vowel")
# front_vowels.each do |v1|
# 	consonants.each do |c|
# 		front_vowels.each do |v2|
# 			$tr.puts "#{v1}1#{c}#{v2}"
# 		end
# 	end
# end

# back_vowels.each do |v1|
# 	consonants.each do |c|
# 		back_vowels.each do |v2|
# 			$tr.puts "#{v1}1#{c}#{v2}"
# 		end
# 	end
# end

# Emit one "V1CV" pattern per (vowel, consonant, vowel) triple, i.e. allow a
# break after a vowel that is followed by a single consonant and another vowel.
# product yields the triples in the same order as three nested each loops.
vowels.product(consonants, vowels) do |before, middle, after|
	$tr.puts "#{before}1#{middle}#{after}"
end

add_comment("These combinations occur in words of foreign origin or joined words")
consonants.each do |c|
  	$tr.puts "Ð°1#{c}Ñ"
  	# $tr.puts "a1#{c}e"
	# $tr.puts "y1#{c}Ã¤"
	# $tr.puts "y1#{c}i"
	# $tr.puts "y1#{c}e"
	# $tr.puts "o1#{c}i"
	# $tr.puts "o1#{c}e"
	# $tr.puts "u1#{c}i"
	# $tr.puts "u1#{c}e"
	# $tr.puts "i1#{c}a"
	# $tr.puts "i1#{c}o"
	# $tr.puts "e1#{c}a"
	# $tr.puts "e1#{c}o"
	# $tr.puts "Ã¤1#{c}o"
	# $tr.puts "Ã¤1#{c}a"
	# $tr.puts "Ã¶1#{c}a"
end

add_comment("Allow hyphen between two consonants (if there is only two of them), except when they are at the begining of the word")
# For every ordered consonant pair emit two patterns:
#   "C1C"  - allow a break between the two consonants
#   ".C2C" - forbid that break when the pair starts the word
consonants.product(consonants) do |first, second|
	$tr.puts "#{first}1#{second}", ".#{first}2#{second}"
end

# Triple-consonant patterns: for each common suffix consonant, emit a pattern
# that forbids a break inside the leading consonant pair and allows one before
# the suffix consonant.
# NOTE(review): the active pattern and all commented-out ones below use
# Latin-script letters, presumably left over from the Turkmen generator; they
# look like they need Cyrillic (Kazakh) equivalents -- confirm with the author.
add_comment("Patterns for triple consonants. There may be additions to this category, as this list is not exhaustive.")
common_suffix_consonants.each do |c|
	$tr.puts "Ã½2t1#{c}"
	# $tr.puts "Ã½2n1#{c}"
	# $tr.puts "Ã½2d1#{c}"
	# $tr.puts "r2t1#{c}"
	# $tr.puts "Ã½2p1#{c}"
	# $tr.puts "l2p1#{c}"
	# $tr.puts "l2t1#{c}"
	# $tr.puts "g2t1#{c}"
	# $tr.puts "n2t1#{c}"
	# $tr.puts "r2k1#{c}"
	# $tr.puts "r2p1#{c}"
	# $tr.puts "k2t1#{c}"
	# $tr.puts "r2h1#{c}"
	# $tr.puts "s2t1#{c}"
	# $tr.puts "l2k1#{c}"
	# $tr.puts "w2p1#{c}"
	# $tr.puts "n2s1#{c}"
	# $tr.puts "r2s1#{c}"
	# $tr.puts "l2m1#{c}"
end

# Hand-written exception patterns, emitted verbatim one per line.
# NOTE(review): these are Latin-script patterns and look carried over from the
# Turkmen generator; presumably they need Cyrillic equivalents -- confirm.
add_comment("Exceptions and single word occurence patterns for words of foreign origin i.e. Russian")
%w[s2k1d l1s2k l1s2t s1t2r n2g1l n1g2r s2k1w].each do |pattern|
	$tr.puts pattern
end

# close the \patterns{...} group and finish the file
$tr.puts '}'
$tr.close

[tex-hyphen] hyphenation for Kazakh

Reply via email to