#!/usr/bin/perl -w
#
# Convert Devnagari Unicode text to ITRANS-like phonetic mapping.
#
# Shamelessly adapted from the JS version at
# http://www.hindidevanagari.com/transliteration/toLatin.js
#
# Copyright 2012, Raj Mathur <raju@kandalaya.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License; either
# version 3, or (at your option) any later version.
#
use strict;

use utf8;
use Encode;
use Getopt::Long;
use Pod::Usage;
################################################################
#
# Conversion tables
#
################################################################
#
# Devanagari digits, listed in order from zero to nine, paired
# positionally with the ASCII digits that replace them.
my %table_digits;
@table_digits{ qw(० १ २ ३ ४ ५ ६ ७ ८ ९) } = qw(0 1 2 3 4 5 6 7 8 9);
#
# Swaras (independent vowels), matras (dependent vowel signs) and a few
# related signs.  Independent forms map to upper-case Roman, matras to
# lower-case.
my %table_swar = (
    # Independent vowels.
    'अ' => 'A',
    'आ' => 'AA',
    'इ' => 'I',
    'ई' => 'EE',
    'उ' => 'U',
    'ऊ' => 'OO',
    'ऋ' => 'RI',
    'ए' => 'E',
    'ऐ' => 'AI',
    'ओ' => 'O',
    'औ' => 'AO',
    'ऍ' => 'AE',
    'ऑ' => 'AW',

    # Matras.
    'ा' => 'aa',
    'ि' => 'i',
    'ी' => 'ee',
    'ु' => 'u',
    'ू' => 'oo',
    'ृ' => 'Ri',
    'े' => 'e',
    'ै' => 'ai',
    'ो' => 'o',
    'ौ' => 'ao',
    'ॅ' => 'ae',
    'ॉ' => 'aw',

    # Other signs.
    'ॐ' => 'OmSymbol',
    'ः' => 'H',
    # NOTE(review): this key looks like a multi-character sequence; a
    # lookup keyed on one character at a time would never reach it.
    'र्‍' => 'R',
);
#
# Complex consonants: nukta forms and the ज्ञ conjunct.  These are meant
# to take priority over the plain consonants they are built from.
my %table_complex_vyanjan = (
    # Nukta forms.
    'क़' => 'Q',
    'ख़' => 'Kh',
    'ग़' => 'G',
    'ज़' => 'Z',
    'ड़' => 'Dd',
    'ढ़' => 'Ddh',
    'फ़' => 'f',

    # Conjunct.
    'ज्ञ' => 'gNy',
);
# Vyanjans (consonants).  Each one carries an inherent 'a' that the
# transliterator appends separately, so only the consonant sound itself
# is listed here.
my %table_vyanjan = (
    # ka-varga
    'क' => 'k',   'ख' => 'kh',  'ग' => 'g',   'घ' => 'gh',  'ङ' => 'NG',
    # cha-varga
    'च' => 'ch',  'छ' => 'chh', 'ज' => 'j',   'झ' => 'jh',  'ञ' => 'NJ',
    # Ta-varga
    'ट' => 'T',   'ठ' => 'Th',  'ड' => 'D',   'ढ' => 'Dh',  'ण' => 'NN',
    # ta-varga
    'त' => 't',   'थ' => 'th',  'द' => 'd',   'ध' => 'dh',  'न' => 'n',
    # pa-varga
    'प' => 'p',   'फ' => 'ph',  'ब' => 'b',   'भ' => 'bh',  'म' => 'm',
    # Semivowels
    'य' => 'y',   'र' => 'r',   'ल' => 'l',   'व' => 'v',
    # Sibilants and ha
    'श' => 'sh',  'ष' => 'Sh',  'स' => 's',   'ह' => 'h',
    # Additional consonant
    'ळ' => 'L',
    # Nukta forms
    'ढ़' => 'Ddh',
    'ड़' => 'Dd',
    'ज़' => 'J',
    'फ़' => 'F',
    'ख़' => 'Kh',
    'ग़' => 'G',
    'क़' => 'Q',
);
#
# Nasalisation signs and the danda.  They are substituted directly, like
# vyanjans, except that no inherent 'a' is added after them.
my %table_special_marks = (
    'ँ' => 'Nn',    # chandrabindu
    'ं' => 'N',     # anusvara
    '।' => '.',     # danda
);
################################################################
#
# Main
#
################################################################
#
# Options.  -o/--output names the output file ('-', the default, means
# standard output); -h/--help prints the embedded POD and exits.
my $output_file = '-';
my $help        = undef;
my $result      = GetOptions(
    'output|o=s' => \$output_file,
    'help|h'     => \$help,
);
pod2usage(-exitval => 0, -verbose => 2)
    if $help;
pod2usage(-exitval => 2, -verbose => 1)
    unless $result;
#
# Open the output.  The three-argument form of open keeps characters in
# a user-supplied file name (leading '>', trailing '|', ...) from being
# interpreted as part of the open mode.
my $OF;
if ($output_file eq '-') {
    $OF = \*STDOUT;
}
else {
    open($OF, '>', $output_file)
        or die "$0: unable to open $output_file for writing: $!\n";
}
#
# Transliterate every input in turn; with no file arguments read
# standard input.  Input bytes are assumed to be UTF-8.
my @infiles = @ARGV ? @ARGV : ('-');
foreach my $f (@infiles) {
    my $IF;
    if ($f eq '-') {
        $IF = \*STDIN;
    }
    elsif (!open($IF, '<', $f)) {
        # An unreadable input is reported and skipped; the remaining
        # inputs are still processed.
        print STDERR "$0: unable to open $f for reading: $!\n";
        next;
    }
    while (my $l = <$IF>) {
        # unicode_to_itrans works on decoded characters and passes
        # through anything it has no mapping for, so its result must be
        # encoded again before it is written.  Printing the character
        # string directly triggers "Wide character" warnings and writes
        # Latin-1 bytes for characters in the U+0080..U+00FF range.
        print {$OF} encode('UTF-8', unicode_to_itrans(decode('utf8', $l)));
    }
    close($IF)
        unless $f eq '-';
}
#
# Buffered write errors (disk full, closed pipe) only show up when the
# handle is closed, so the result of close must be checked.
close($OF)
    or die "$0: error closing $output_file: $!\n";
exit(0);
################################################################
#
# unicode_to_itrans
#
# Convert Unicode input using ITRANS phonetic transliteration.
#
################################################################
#
# Matras (dependent vowel signs): when one of these follows a vyanjan it
# takes the place of the inherent 'a'.  The main program calls exit()
# before execution reaches this statement, so the initialiser below is
# never run; unicode_to_itrans() fills the value in on its first call.
my $matra_string = undef;

#
# unicode_to_itrans($text)
#
# Transliterate a string of decoded characters (not UTF-8 bytes) and
# return the result as a character string.
#
# The input is processed one character at a time:
#   - a vyanjan becomes its Roman form plus an inherent 'a', unless it
#     is followed by a matra, a virama (which is consumed), or a
#     non-word character / end of input (where only 'य' and 'र' keep
#     the 'a');
#   - a vyanjan followed by a nukta, and the ज्ञ conjunct, are consumed
#     as a single unit;
#   - swaras, matras, digits and special marks are replaced from their
#     tables;
#   - anything else is copied through unchanged.
#
# Table lookups use one character as the key, so any multi-character
# keys in the tables are never consulted.
#
# NOTE(review): a decomposed ज + nukta yields 'z' here, while the tables
# give 'Z' / 'J' for what appears to be the same letter -- confirm which
# spelling is intended.
sub unicode_to_itrans {
    my $l = shift;

    unless ($matra_string) {
        $matra_string = 'ृाैौॅॉिीेुूो';
    }

    my $nukta  = '़';    # U+093C DEVANAGARI SIGN NUKTA
    my $virama = '्';    # U+094D DEVANAGARI SIGN VIRAMA

    # Base consonant => Roman form when that consonant is followed by a
    # nukta.  ड and ढ use the values the tables give for their nukta
    # forms, so decomposed input is not left with a stray nukta.
    my %nukta_form = (
        'ज' => 'z',
        'ख' => 'Kh',
        'क' => 'Q',
        'ग' => 'G',
        'फ' => 'f',
        'ड' => 'Dd',
        'ढ' => 'Ddh',
    );

    my $t    = '';
    my @char = split(//, $l);

    for (my $i = 0; $i <= $#char; $i++) {
        my $c = $char[$i];

        # Lookahead that never reads past the end of the input.
        my $next1 = $i + 1 <= $#char ? $char[$i + 1] : '';
        my $next2 = $i + 2 <= $#char ? $char[$i + 2] : '';

        my $increment = 0;      # extra characters consumed with $c
        my $replace   = undef;
        my $add_a     = 0;

        # exists(), not truthiness: '०' maps to '0', which is false.
        if (exists $table_vyanjan{$c}) {
            if ($next1 eq $nukta) {
                # Consonant + nukta.  Forms without a dedicated mapping
                # fall back to the base consonant; the nukta is consumed
                # either way so it is never emitted raw.
                $replace = exists $nukta_form{$c}
                    ? $nukta_form{$c}
                    : $table_vyanjan{$c};
                $increment = 1;
            }
            elsif ($c eq 'ज' && $next1 eq $virama && $next2 eq 'ञ') {
                $replace   = 'gNy';
                $increment = 2;
            }
            else {
                $replace = $table_vyanjan{$c};
            }
            $add_a = 1;
        }
        elsif (exists $table_complex_vyanjan{$c}) {
            $replace = $table_complex_vyanjan{$c};
            $add_a   = 1;
        }
        elsif (exists $table_swar{$c}) {
            $replace = $table_swar{$c};
        }
        elsif (exists $table_digits{$c}) {
            $replace = $table_digits{$c};
        }
        elsif (exists $table_special_marks{$c}) {
            $replace = $table_special_marks{$c};
        }
        else {
            $replace = $c;
        }

        #
        # Decide whether the inherent 'a' survives.  The character that
        # matters is the one after everything consumed so far; looking
        # at $char[$i+1] would see the conjunct's own virama and swallow
        # the character that follows the conjunct.
        if ($add_a) {
            my $after      = $i + $increment + 1;
            my $next_char  = $after <= $#char ? $char[$after] : undef;
            my $keeps_a    = ($c eq 'य' || $c eq 'र');

            if (!defined $next_char) {
                # End of input counts as a word boundary, so a last line
                # without a trailing newline matches one with it.
                $add_a = 0
                    unless $keeps_a;
            }
            elsif ($next_char =~ /[\Q$matra_string\E]/) {
                $add_a = 0;
            }
            elsif ($next_char eq $virama) {
                # The virama suppresses the vowel and is consumed.
                $add_a = 0;
                $increment++;
            }
            elsif ($next_char =~ /\W/) {
                $add_a = 0
                    unless $keeps_a;
            }
        }

        $t .= $replace;
        $t .= 'a'
            if $add_a;
        $i += $increment;
    }
    return $t;
}

=pod

=head1 NAME

devnagari-to-itrans - transliterate Devnagari (Unicode) text to
something resembling ITRANS.

=head1 SYNOPSIS

perl devnagari-to-itrans.pl [-o|--output file] [-h|--help] [file...]

=head1 DESCRIPTION

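Transliterate most Unicode Devnagari to ITRANS-like phonetic Roman
script.  Input is read from the named files, or from standard input
when no file is given or a file is named C<->.  Output goes to
standard output unless an output file is named with B<-o>.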

=head1 AUTHOR

devnagari-to-itrans is written by Raj Mathur
E<lt>raju@kandalaya.orgE<gt>.

=head1 LICENCE

devnagari-to-itrans is available under the terms of the GNU General
Public Licence version 3 or, at your option, any later version.

=cut

__END__;
