D1: UTF8 char[] casting to wchar[] array cast misalignment ERROR

jicman via Digitalmars-d-learn Mon, 16 Jun 2014 19:31:22 -0700


Greetings!

I have a bunch of files: plain ASCII, UTF8 and UTF16, with and without BOM (Byte Order Mark). I had, "I thought", a nice way of figuring out what type of encoding the file was (ASCII, UTF8 or UTF16) when the BOM was missing, by reading the content and applying the std.utf.validate function to the char[] or wchar[] string. The problem is that lately, I am running into a wall with the "array cast misalignment" error when casting to wchar[], i.e.


auto text = cast(string) file.read();
wchar[] temp = cast(wchar[]) text;

What would be the correct process to find out a text file's encoding?

Any help would be greatly appreciated. This is the code that I have right now...


//begin code
char[] ReadFileData2UTF8(char[] file, out char[] bom)
{
  auto text = cast(string) file.read();
  if (text.length == 0)
  {
    bom = "NO_BOM";
    return "";
  }
  else if (text.length == 1)
  {
    ubyte[1] b = cast(ubyte[]) text[0 .. 1];
    bom = getBOM(b);
  }
  else if (text.length == 2)
  {
    ubyte[2] b = cast(ubyte[]) text[0 .. 2];
    bom = getBOM(b);
  }
  else if (text.length == 3)
  {
    ubyte[3] b = cast(ubyte[]) text[0 .. 3];
    bom = getBOM(b);
  }
  else if (text.length > 3)
  {
    ubyte[4] b = cast(ubyte[]) text[0 .. 4];
    bom = getBOM(b);
  }
  //writefln(bom);
  if (std.string.find(bom, "UTF16") == 0)
  {
    ubyte[] bs = cast(ubyte[]) text;
    if (bs[0 .. 2] == UTF16_be || bs[0 .. 2] == UTF16_le)
      bs = bs[2 .. $];
    text = cast(char[]) bs;
    wchar[] temp = cast(wchar[]) text; //text[2 .. $];
    text = std.utf.toUTF8(temp);
  }
  else if (std.string.find(bom, "UTF32") == 0)
  {
    ubyte[] bs = cast(ubyte[]) text;
    if (bs[0 .. 4] == UTF32_be || bs[0 .. 4] == UTF32_le)
      bs = bs[4 .. $];
    text = cast(char[]) bs;
    dchar[] temp = cast(dchar[]) text; //text[2 .. $];
    text = std.utf.toUTF8(temp);
  }
  else if (bom == "UTF8")
  {
    ubyte[] bs = cast(ubyte[]) text;
    if (bs[0 .. 3] == UTF8)
      bs = bs[3 .. $];
    text = cast(char[]) bs;
    // text is already UTF8
  }
  else // hopeing I can figure out the type...
  {
    //msgBox("No BOM");
    //ubyte[] bs = cast(ubyte[]) text;
    try // utf8
    {
      validate(text);
      bom = "UTF8";
    }
    catch (UtfException e)
    {
      //msgBox("Failed UTF8. Trying UTF16");
      //text = cast(char[]) bs;
      //if ((text.length % 2) == 1)
      //  text ~= " ";
      try //utf16
      {
        wchar[] temp = cast(wchar[]) text; //text[2 .. $];
        //wchar[] temp = std.utf.toUTF16(text); //text[2 .. $];
        validate(temp);
        text = std.utf.toUTF8(temp);
        bom = "UTF16_le";
      }
      catch (UtfException e)
      {
        //msgBox("Failed UTF16. Trying UTF32");
        //text = cast(char[]) bs;
        try // utf32
        {
          dchar[] temp = cast(dchar[]) text; //text[2 .. $];
          //dchar[] temp = std.utf.toUTF32(text); //text[2 .. $];
          validate(temp);
          text = std.utf.toUTF8(temp);
          bom = "UTF32_le";
        }
        catch (UtfException e) // hoping for ASCII
        {
          //msgBox("Failed UTF32. Hoping ASCII");
          text ~= "\000";
          char[] temp = std.windows.charset.fromMBSz(text.ptr,0);
          text = std.utf.toUTF8(temp);
          //text = temp;
          bom = "NO_BOM";
        }
      }
    }
  }
  return text;
}
//end code

D1: UTF8 char[] casting to wchar[] array cast misalignment ERROR

Reply via email to