Commit 205fafa5 authored by Simon Kelley

Improve performance when reading large hostfiles.

parent 8ecfaa4a
...@@ -635,11 +635,12 @@ struct crec *cache_find_by_addr(struct crec *crecp, struct all_addr *addr, ...@@ -635,11 +635,12 @@ struct crec *cache_find_by_addr(struct crec *crecp, struct all_addr *addr,
} }
static void add_hosts_entry(struct crec *cache, struct all_addr *addr, int addrlen, static void add_hosts_entry(struct crec *cache, struct all_addr *addr, int addrlen,
unsigned short flags, int index, int addr_dup) unsigned short flags, int index, struct crec **rhash)
{ {
struct crec *lookup = cache_find_by_name(NULL, cache->name.sname, 0, flags & (F_IPV4 | F_IPV6)); struct crec *lookup = cache_find_by_name(NULL, cache->name.sname, 0, flags & (F_IPV4 | F_IPV6));
int i, nameexists = 0; int i, nameexists = 0;
struct cname *a; struct cname *a;
unsigned int j;
/* Remove duplicates in hosts files. */ /* Remove duplicates in hosts files. */
if (lookup && (lookup->flags & F_HOSTS)) if (lookup && (lookup->flags & F_HOSTS))
...@@ -653,33 +654,41 @@ static void add_hosts_entry(struct crec *cache, struct all_addr *addr, int addrl ...@@ -653,33 +654,41 @@ static void add_hosts_entry(struct crec *cache, struct all_addr *addr, int addrl
} }
/* Ensure there is only one address -> name mapping (first one trumps) /* Ensure there is only one address -> name mapping (first one trumps)
We do this by steam here, first we see if the address is the same as We do this by steam here. The entries are kept in hash chains, linked
the last one we saw, which eliminates most in the case of an ad-block by ->next (which is unused at this point) held in hash buckets in
file with thousands of entries for the same address. the array rhash. Note that rhash and the values in ->next are only valid
Then we search and bail at the first matching address that came from whilst reading hosts files: the buckets are then freed, and the
->next pointer used for other things.
We search and bail at the first matching address that came from
a HOSTS file. Since the first host entry gets reverse, we know a HOSTS file. Since the first host entry gets reverse, we know
then that it must exist without searching exhaustively for it. */ then that it must exist without searching exhaustively for it.
This complexity avoids O(n^2) divergent CPU use whilst reading
large (10000 entry) hosts files. */
if (addr_dup) /* hash address */
flags &= ~F_REVERSE; for (j = 0, i = 0; i < addrlen; i++)
else j += ((unsigned char *)addr)[i] + (j << 6) + (j << 16) - j;
for (i=0; i<hash_size; i++)
for (lookup = rhash[j % RHASHSIZE]; lookup; lookup = lookup->next)
if ((lookup->flags & F_HOSTS) &&
(lookup->flags & flags & (F_IPV4 | F_IPV6)) &&
memcmp(&lookup->addr.addr, addr, addrlen) == 0)
{ {
for (lookup = hash_table[i]; lookup; lookup = lookup->hash_next) flags &= ~F_REVERSE;
if ((lookup->flags & F_HOSTS) && break;
(lookup->flags & flags & (F_IPV4 | F_IPV6)) &&
memcmp(&lookup->addr.addr, addr, addrlen) == 0)
{
flags &= ~F_REVERSE;
break;
}
if (lookup)
break;
} }
cache->flags = flags; cache->flags = flags;
cache->uid = index; cache->uid = index;
/* maintain address hash chain */
cache->next = rhash[j % RHASHSIZE];
rhash[j % RHASHSIZE] = cache;
memcpy(&cache->addr.addr, addr, addrlen); memcpy(&cache->addr.addr, addr, addrlen);
cache_hash(cache); cache_hash(cache);
/* don't need to do alias stuff for second and subsequent addresses. */ /* don't need to do alias stuff for second and subsequent addresses. */
...@@ -743,14 +752,14 @@ static int gettok(FILE *f, char *token) ...@@ -743,14 +752,14 @@ static int gettok(FILE *f, char *token)
} }
} }
static int read_hostsfile(char *filename, int index, int cache_size) static int read_hostsfile(char *filename, int index, int cache_size, struct crec **rhash)
{ {
FILE *f = fopen(filename, "r"); FILE *f = fopen(filename, "r");
char *token = daemon->namebuff, *domain_suffix = NULL; char *token = daemon->namebuff, *domain_suffix = NULL;
int addr_count = 0, name_count = cache_size, lineno = 0; int addr_count = 0, name_count = cache_size, lineno = 0;
unsigned short flags = 0, saved_flags = 0; unsigned short flags = 0;
struct all_addr addr, saved_addr; struct all_addr addr;
int atnl, addrlen = 0, addr_dup; int atnl, addrlen = 0;
if (!f) if (!f)
{ {
...@@ -762,7 +771,6 @@ static int read_hostsfile(char *filename, int index, int cache_size) ...@@ -762,7 +771,6 @@ static int read_hostsfile(char *filename, int index, int cache_size)
while ((atnl = gettok(f, token)) != EOF) while ((atnl = gettok(f, token)) != EOF)
{ {
addr_dup = 0;
lineno++; lineno++;
#ifdef HAVE_IPV6 #ifdef HAVE_IPV6
...@@ -794,14 +802,6 @@ static int read_hostsfile(char *filename, int index, int cache_size) ...@@ -794,14 +802,6 @@ static int read_hostsfile(char *filename, int index, int cache_size)
continue; continue;
} }
if (saved_flags == flags && memcmp(&addr, &saved_addr, addrlen) == 0)
addr_dup = 1;
else
{
saved_flags = flags;
saved_addr = addr;
}
addr_count++; addr_count++;
/* rehash every 1000 names. */ /* rehash every 1000 names. */
...@@ -832,14 +832,13 @@ static int read_hostsfile(char *filename, int index, int cache_size) ...@@ -832,14 +832,13 @@ static int read_hostsfile(char *filename, int index, int cache_size)
strcpy(cache->name.sname, canon); strcpy(cache->name.sname, canon);
strcat(cache->name.sname, "."); strcat(cache->name.sname, ".");
strcat(cache->name.sname, domain_suffix); strcat(cache->name.sname, domain_suffix);
add_hosts_entry(cache, &addr, addrlen, flags, index, addr_dup); add_hosts_entry(cache, &addr, addrlen, flags, index, rhash);
addr_dup = 1;
name_count++; name_count++;
} }
if ((cache = whine_malloc(sizeof(struct crec) + strlen(canon)+1-SMALLDNAME))) if ((cache = whine_malloc(sizeof(struct crec) + strlen(canon)+1-SMALLDNAME)))
{ {
strcpy(cache->name.sname, canon); strcpy(cache->name.sname, canon);
add_hosts_entry(cache, &addr, addrlen, flags, index, addr_dup); add_hosts_entry(cache, &addr, addrlen, flags, index, rhash);
name_count++; name_count++;
} }
free(canon); free(canon);
...@@ -863,6 +862,7 @@ void cache_reload(void) ...@@ -863,6 +862,7 @@ void cache_reload(void)
struct crec *cache, **up, *tmp; struct crec *cache, **up, *tmp;
int i, total_size = daemon->cachesize; int i, total_size = daemon->cachesize;
struct hostsfile *ah; struct hostsfile *ah;
struct crec **reverse_hash;
cache_inserted = cache_live_freed = 0; cache_inserted = cache_live_freed = 0;
...@@ -895,14 +895,22 @@ void cache_reload(void) ...@@ -895,14 +895,22 @@ void cache_reload(void)
my_syslog(LOG_INFO, _("cleared cache")); my_syslog(LOG_INFO, _("cleared cache"));
return; return;
} }
if (!(reverse_hash = whine_malloc(sizeof(struct crec *) * RHASHSIZE)))
return;
for (i = 0; i < RHASHSIZE; i++)
reverse_hash[i] = NULL;
if (!option_bool(OPT_NO_HOSTS)) if (!option_bool(OPT_NO_HOSTS))
total_size = read_hostsfile(HOSTSFILE, 0, total_size); total_size = read_hostsfile(HOSTSFILE, 0, total_size, reverse_hash);
daemon->addn_hosts = expand_filelist(daemon->addn_hosts); daemon->addn_hosts = expand_filelist(daemon->addn_hosts);
for (ah = daemon->addn_hosts; ah; ah = ah->next) for (ah = daemon->addn_hosts; ah; ah = ah->next)
if (!(ah->flags & AH_INACTIVE)) if (!(ah->flags & AH_INACTIVE))
total_size = read_hostsfile(ah->fname, ah->index, total_size); total_size = read_hostsfile(ah->fname, ah->index, total_size, reverse_hash);
free(reverse_hash);
} }
char *get_domain(struct in_addr addr) char *get_domain(struct in_addr addr)
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
*/ */
#define FTABSIZ 150 /* max number of outstanding requests (default) */ #define FTABSIZ 150 /* max number of outstanding requests (default) */
#define RHASHSIZE 1024 /* hash buckets for address lookup during hostfile read */
#define MAX_PROCS 20 /* max no children for TCP requests */ #define MAX_PROCS 20 /* max no children for TCP requests */
#define CHILD_LIFETIME 150 /* secs 'till terminated (RFC1035 suggests > 120s) */ #define CHILD_LIFETIME 150 /* secs 'till terminated (RFC1035 suggests > 120s) */
#define EDNS_PKTSZ 4096 /* default max EDNS.0 UDP packet from RFC5625 */ #define EDNS_PKTSZ 4096 /* default max EDNS.0 UDP packet from RFC5625 */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment