Hello all,

I am trying to learn how to parse, modify, and redisplay a Japanese webpage passed to me in a form and am wondering if anyone has an example of how to do this.

I looked at htmlget and found that it has a couple problems: namely, it is not conform to current D2 practices. I am not sure that my hack can be considered a fix but have attached it nonetheless. It now works correctly on ascii based urls but not utf-8.

My lack of knowledge on how to properly parsing unicode documents has left me stumped. I am therefore requesting some assistance in updating the code such that it works with any url. I have taken a look at std.utf and there are a few things there that could possibly assist me however without examples I'm somewhat at a loss.

I'm assuming that the problem exists here:

        for (iw = 0; iw != line.length; iw++)
        {
            if (!icmp("</html>", line[iw .. line.length]))
                break print_lines;
        }

From what I understanding, one cannot index a utf sequence the same as you index ASCII. What is the proper what to rewrite this such that it parses the utf characters correctly? And example would do wonders.

Thanks
/*
        HTMLget written by Christopher E. Miller
        This code is public domain.
        You may use it for any purpose.
        This code has no warranties and is provided 'as-is'.
 */

// debug = HTMLGET;

import std.string, std.conv, std.stream;
import std.socket, std.socketstream;
import std.algorithm, std.stdio;
import std.utf;

int main(string[] args)
{
    if (args.length < 2)
    {
        writef("Usage:\n   htmlget <web-page>\n");
        return 0;
    }

    string url = args[1];
    int i;

    i = std.algorithm.countUntil(url, "://");

    if (i != -1)
    {
        if (icmp(url[0 .. i], "http"))
            throw new Exception("http:// expected");
    }

    std.algorithm.findSkip(url, "://");
    i = std.algorithm.countUntil(url, '#');

    if (i != -1)    // Remove anchor ref.
        url = url[0 .. i];

    i = std.algorithm.countUntil(url, '/');
    string domain;

    if (i == -1)
    {
        domain = url;
        url    = "/";
    }
    else
    {
        domain = url[0 .. i];
        url    = url[i .. url.length];
    }

    ushort port;
    i = std.algorithm.countUntil(domain, ':');

    if (i == -1)
    {
        port = 80;         // Default HTTP port.
    }
    else
    {
        port   = std.conv.to!ushort(domain[i + 1 .. domain.length]);
        domain = domain[0 .. i];
    }

    debug (HTMLGET)
        writef("Connecting to " ~ domain ~ " on port " ~ 
std.string.toString(port) ~ "...\n");

    /*auto*/ Socket sock = new TcpSocket(new InternetAddress(domain, port));
    Stream ss        = new SocketStream(sock);

    debug (HTMLGET)
        writef("Connected!\nRequesting URL \"" ~ url ~ "\"...\n");

    if (port != 80)
        domain = domain ~ ":" ~ std.conv.to!string(port);

    ss.writeString("GET " ~ url ~ " HTTP/1.1\r\n"
                   "Host: " ~ domain ~ "\r\n"
                   "\r\n");

    // Skip HTTP header.
    char[] line;

    for (;;)
    {
        line = ss.readLine();

        if (!line.length)
            break;

        const string CONTENT_TYPE_NAME = "Content-Type: ";

        if (line.length > CONTENT_TYPE_NAME.length &&
            !icmp(CONTENT_TYPE_NAME, line[0 .. CONTENT_TYPE_NAME.length]))
        {
            char[] type;
            type = line[CONTENT_TYPE_NAME.length .. line.length];

            if (type.length <= 5 || icmp("text/", type[0 .. 5]))
                throw new Exception("URL is not text");
        }
    }

print_lines:

    while (!ss.eof())
    {
        line = ss.readLine();
        writef("%s\n", line);
        
        // if(std.string.ifind(line, "</html>") != -1)
        //      break;
        //size_t iw;

        foreach (iw, val; line)
        {
            //iw = toUTFindex(line, iw);
            writefln("%d: %s",iw,val);
            std.algorithm.findSkip(url, "</html>");
            if (!icmp("</html>", line[iw .. line.length]))
                break print_lines;
        }
    }

    return 0;
}

Reply via email to