Add a "limit_ip" attribute to ht://Dig 3.1.5.  A URL that does not match the
URL limits is still accepted when its host name resolves to one of the IP
addresses listed in limit_ip.

Whitespace inside context lines was reconstructed and may differ from the
pristine sources; if a hunk is rejected, retry with "patch -l".

diff -Naur htdig-3.1.5.org/htdig/Retriever.cc htdig-3.1.5/htdig/Retriever.cc
--- htdig-3.1.5.org/htdig/Retriever.cc	Fri Feb 25 03:29:10 2000
+++ htdig-3.1.5/htdig/Retriever.cc	Mon Feb 28 22:59:42 2000
@@ -21,3 +21,4 @@
+#include <netdb.h>
 #include "HtWordType.h"
 
 static WordList words;
@@ -618,6 +619,7 @@
 {
     static Dictionary *invalids = 0;
     static Dictionary *valids = 0;
+    static Dictionary *validip = 0;
 
     //
     // Invalid extensions will be kept in a dictionary for quick
@@ -661,6 +663,27 @@
         }
     }
 
+    //
+    // Valid IP address check: build the dictionary of allowed IPs.
+    //
+    if (!validip)
+    {
+        // A list of IP addresses, separated by spaces or tabs
+        String t = config["limit_ip"];
+        String lowerp;
+        char *p = strtok(t, " \t");
+        validip = new Dictionary;
+        while (p)
+        {
+            // Store each entry lowercased as the dictionary key
+            lowerp = p;
+            lowerp.lowercase();
+            validip->Add(lowerp, 0);
+            p = strtok(0, " \t");
+        }
+    }
+
+
     static String url;
     url = u;
 
@@ -743,6 +766,52 @@
     //
     if (limits.FindFirst(url) >= 0)
         return TRUE;
+
+    //
+    // Check also the IP based limits.  Only URLs that start with
+    // "http://" are examined; the host name is the text between that
+    // prefix and the first '/' or ':'.
+    //
+    if (strncmp(url, "http://", 7) == 0)
+    {
+        int urllen = strlen(url);
+        int end = 7;
+        while (end < urllen && url[end] != '/' && url[end] != ':')
+            end++;
+        int len = end - 7;
+        if (len > 0)
+        {
+            char *hostaddr = (char *) malloc(len + 1);
+            if (hostaddr == NULL)
+                return FALSE;
+            strncpy(hostaddr, url + 7, len);
+            hostaddr[len] = '\0';
+
+            // gethostbyname() returns NULL when the lookup fails, so
+            // the loop below must not touch ht in that case.
+            struct hostent *ht = gethostbyname(hostaddr);
+            for (int i = 0; ht && ht->h_addr_list[i] != NULL; i++)
+            {
+                // Read the octets as unsigned char so that values above
+                // 127 are printed correctly whether or not plain char
+                // is signed.
+                unsigned char *octet = (unsigned char *) ht->h_addr_list[i];
+                char ipaddr[16];
+                sprintf(ipaddr, "%d.%d.%d.%d",
+                        octet[0], octet[1], octet[2], octet[3]);
+                if (debug > 3)
+                    cout << endl <<" Hostname: " << hostaddr << " Resolved to: " << ipaddr;
+                if (validip->Exists(ipaddr))
+                {
+                    if (debug > 2)
+                        cout << endl <<" Accepted: " << ipaddr << " <-> " << hostaddr << endl;
+                    free(hostaddr);
+                    return TRUE;
+                }
+            }
+            free(hostaddr);
+        }
+    }
 
     if (debug > 2)
         cout << endl <<" Rejected: URL not in the limits!";
diff -Naur htdig-3.1.5.org/installdir/htdig.conf htdig-3.1.5/installdir/htdig.conf
--- htdig-3.1.5.org/installdir/htdig.conf	Fri Feb 25 03:29:12 2000
+++ htdig-3.1.5/installdir/htdig.conf	Mon Feb 28 23:15:40 2000
@@ -40,6 +40,15 @@
 limit_urls_to: ${start_url}
 
 #
+# This attribute limits the scope of the indexing process. It contains
+# the IP addresses of the hosts you want to visit. Every website whose host
+# name resolves to one of these addresses will be indexed. The limits of
+# limit_urls_to and limit_ip are or'ed.
+#
+limit_ip: 10.0.0.1
+
+
+#
 # If there are particular pages that you definately do NOT want to index, you
 # can use the exclude_urls attribute. The value is a list of string patterns.
 # If a URL matches any of the patterns, it will NOT be indexed. This is