Hello all,
I am trying to learn how to parse, modify, and redisplay a Japanese
webpage passed to me in a form and am wondering if anyone has an example
of how to do this.
I looked at htmlget and found that it has a couple problems: namely, it
is not conform to current D2 practices. I am not sure that my hack can
be considered a fix but have attached it nonetheless. It now works
correctly on ascii based urls but not utf-8.
My lack of knowledge on how to properly parsing unicode documents has
left me stumped. I am therefore requesting some assistance in updating
the code such that it works with any url. I have taken a look at std.utf
and there are a few things there that could possibly assist me however
without examples I'm somewhat at a loss.
I'm assuming that the problem exists here:
for (iw = 0; iw != line.length; iw++)
{
if (!icmp("</html>", line[iw .. line.length]))
break print_lines;
}
From what I understanding, one cannot index a utf sequence the same as
you index ASCII. What is the proper what to rewrite this such that it
parses the utf characters correctly? And example would do wonders.
Thanks
/*
HTMLget written by Christopher E. Miller
This code is public domain.
You may use it for any purpose.
This code has no warranties and is provided 'as-is'.
*/
// debug = HTMLGET;
import std.string, std.conv, std.stream;
import std.socket, std.socketstream;
import std.algorithm, std.stdio;
import std.utf;
int main(string[] args)
{
if (args.length < 2)
{
writef("Usage:\n htmlget <web-page>\n");
return 0;
}
string url = args[1];
int i;
i = std.algorithm.countUntil(url, "://");
if (i != -1)
{
if (icmp(url[0 .. i], "http"))
throw new Exception("http:// expected");
}
std.algorithm.findSkip(url, "://");
i = std.algorithm.countUntil(url, '#');
if (i != -1) // Remove anchor ref.
url = url[0 .. i];
i = std.algorithm.countUntil(url, '/');
string domain;
if (i == -1)
{
domain = url;
url = "/";
}
else
{
domain = url[0 .. i];
url = url[i .. url.length];
}
ushort port;
i = std.algorithm.countUntil(domain, ':');
if (i == -1)
{
port = 80; // Default HTTP port.
}
else
{
port = std.conv.to!ushort(domain[i + 1 .. domain.length]);
domain = domain[0 .. i];
}
debug (HTMLGET)
writef("Connecting to " ~ domain ~ " on port " ~
std.string.toString(port) ~ "...\n");
/*auto*/ Socket sock = new TcpSocket(new InternetAddress(domain, port));
Stream ss = new SocketStream(sock);
debug (HTMLGET)
writef("Connected!\nRequesting URL \"" ~ url ~ "\"...\n");
if (port != 80)
domain = domain ~ ":" ~ std.conv.to!string(port);
ss.writeString("GET " ~ url ~ " HTTP/1.1\r\n"
"Host: " ~ domain ~ "\r\n"
"\r\n");
// Skip HTTP header.
char[] line;
for (;;)
{
line = ss.readLine();
if (!line.length)
break;
const string CONTENT_TYPE_NAME = "Content-Type: ";
if (line.length > CONTENT_TYPE_NAME.length &&
!icmp(CONTENT_TYPE_NAME, line[0 .. CONTENT_TYPE_NAME.length]))
{
char[] type;
type = line[CONTENT_TYPE_NAME.length .. line.length];
if (type.length <= 5 || icmp("text/", type[0 .. 5]))
throw new Exception("URL is not text");
}
}
print_lines:
while (!ss.eof())
{
line = ss.readLine();
writef("%s\n", line);
// if(std.string.ifind(line, "</html>") != -1)
// break;
//size_t iw;
foreach (iw, val; line)
{
//iw = toUTFindex(line, iw);
writefln("%d: %s",iw,val);
std.algorithm.findSkip(url, "</html>");
if (!icmp("</html>", line[iw .. line.length]))
break print_lines;
}
}
return 0;
}