当前位置: 首页 > Java > 正文

字符串Hash函数–c语言实现版

关键字:
1 星2 星3 星4 星5 星 (暂无评分)
Loading ... Loading ...
baidu_share

hash查找因为其O(1)的查找性能而著称,被对查找性能要求高的应用所广泛采用。它的基本思想是:
(1) 创建一个定长的线性Hash表,一般可以初始化时指定length;

(2) 设计Hash函数,将关键字key散射到Hash表中。其中hash函数设计是最为关键的,均匀分布、冲突概率小全在它;

(3) 通常采用拉链方法来解决hash冲突问题,即散射到同一个hash表项的关键字,以链表形式来表示(也称为桶backet);

(4) 给定关键字key,就可以在O(1) + O(m)的时间复杂度内定位到目标。其中,m为拉链长度,即桶深。

Hash应用中,字符串是最为常见的关键字,应用非常普通,现在的程序设计语言中基本上都提供了字符串hash表的支持。字符串hash函数非常多,常见的主要有Simple_hash, RS_hash, JS_hash, PJW_hash, ELF_hash, BKDR_hash, SDBM_hash, DJB_hash, AP_hash, CRC_hash等。它们的C语言实现见后面附录代码: hash.h, hash.c。那么这么些字符串hash函数,谁好熟非呢?评估hash函数优劣的基准主要有以下两个指标:

(1) 散列分布性

即桶的使用率backet_usage = (已使用桶数) / (总的桶数),这个比例越高,说明分布性良好,是好的hash设计。

(2) 平均桶长

即avg_backet_len,所有已使用桶的平均长度。理想状态下这个值应该=1,越小说明冲突发生地越少,是好的hash设计。

hash函数计算一般都非常简洁,因此在耗费计算时间复杂性方面判别甚微,这里不作对比。

评估方案设计是这样的:

(1) 以200M的视频文件作为输入源,以4KB的块为大小计算MD5值,并以此作为hash关键字;

(2) 分别应用上面提到的各种字符串hash函数,进行hash散列模拟;

(3) 统计结果,用散列分布性和平均桶长两个指标进行评估分析。

测试程序见附录代码hashtest.c,测试结果如下表所示。从这个结果我们也可以看出,这些字符串hash函数真是不相仲伯,难以决出高低,所以实际应用中可以根据喜好选择。当然,最好实际测试一下,毕竟应用特点不大相同。其他几组测试结果也类似,这里不再给出。

1
2
3
4
5
6
7
8
9
10
11
Hash函数	桶数	Hash调用总数	最大桶长	平均桶长	桶使用率%
simple_hash	10240	47198	16	4.63	99.00%
RS_hash	10240	47198	16	4.63	98.91%
JS_hash	10240	47198	15	4.64	98.87%
PJW_hash	10240	47198	16	4.63	99.00%
ELF_hash	10240	47198	16	4.63	99.00%
BKDR_hash	10240	47198	16	4.63	99.00%
SDBM_hash	10240	47198	16	4.63	98.90%
DJB_hash	10240	47198	15	4.64	98.85%
AP_hash	10240	47198	16	4.63	98.96%
CRC_hash	10240	47198	16	4.64	98.77%

附录源代码:
hash.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#ifndef _HASH_H  
#define _HASH_H  
 
#ifdef __cplusplus  
extern "C" {  
#endif  
 
/* A Simple Hash Function */  
unsigned int simple_hash(char *str);  
 
/* RS Hash Function */  
unsigned int RS_hash(char *str);  
 
/* JS Hash Function */  
unsigned int JS_hash(char *str);  
 
/* P. J. Weinberger Hash Function */  
unsigned int PJW_hash(char *str);  
 
/* ELF Hash Function */  
unsigned int ELF_hash(char *str);  
 
/* BKDR Hash Function */  
unsigned int BKDR_hash(char *str);  
 
/* SDBM Hash Function */  
unsigned int SDBM_hash(char *str);  
 
/* DJB Hash Function */  
unsigned int DJB_hash(char *str);  
 
/* AP Hash Function */  
unsigned int AP_hash(char *str);  
 
/* CRC Hash Function */  
unsigned int CRC_hash(char *str);  
 
#ifdef __cplusplus  
}  
#endif  
 
#endif

hash.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#include <string.h>  
#include "hash.h"  
 
/* A Simple Hash Function */  
unsigned int simple_hash(char *str)  
{  
    register unsigned int hash;  
    register unsigned char *p;  
 
    for(hash = 0, p = (unsigned char *)str; *p ; p++)  
        hash = 31 * hash + *p;  
 
    return (hash & 0x7FFFFFFF);  
}  
 
/* RS Hash Function */  
unsigned int RS_hash(char *str)  
{  
         unsigned int b = 378551;  
         unsigned int a = 63689;  
         unsigned int hash = 0;  
 
         while (*str)  
         {  
                 hash = hash * a + (*str++);  
                 a *= b;  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* JS Hash Function */  
unsigned int JS_hash(char *str)  
{  
         unsigned int hash = 1315423911;  
 
         while (*str)  
         {  
                 hash ^= ((hash << 5) + (*str++) + (hash >> 2));  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* P. J. Weinberger Hash Function */  
unsigned int PJW_hash(char *str)  
{  
         unsigned int BitsInUnignedInt = (unsigned int)(sizeof(unsigned int) * 8);  
         unsigned int ThreeQuarters     = (unsigned int)((BitsInUnignedInt   * 3) / 4);  
         unsigned int OneEighth         = (unsigned int)(BitsInUnignedInt / 8);  
 
         unsigned int HighBits          = (unsigned int)(0xFFFFFFFF) << (BitsInUnignedInt - OneEighth);  
         unsigned int hash              = 0;  
         unsigned int test              = 0;  
 
         while (*str)  
         {  
                 hash = (hash << OneEighth) + (*str++);  
                 if ((test = hash & HighBits) != 0)  
                 {  
                         hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));  
                 }  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* ELF Hash Function */  
unsigned int ELF_hash(char *str)  
{  
         unsigned int hash = 0;  
         unsigned int x     = 0;  
 
         while (*str)  
         {  
                 hash = (hash << 4) + (*str++);  
                 if ((x = hash & 0xF0000000L) != 0)  
                 {  
                         hash ^= (x >> 24);  
                         hash &= ~x;  
                 }  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* BKDR Hash Function */  
unsigned int BKDR_hash(char *str)  
{  
         unsigned int seed = 131; // 31 131 1313 13131 131313 etc..  
         unsigned int hash = 0;  
 
         while (*str)  
         {  
                 hash = hash * seed + (*str++);  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* SDBM Hash Function */  
unsigned int SDBM_hash(char *str)  
{  
         unsigned int hash = 0;  
 
         while (*str)  
         {  
                 hash = (*str++) + (hash << 6) + (hash << 16) - hash;  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* DJB Hash Function */  
unsigned int DJB_hash(char *str)  
{  
         unsigned int hash = 5381;  
 
         while (*str)  
         {  
                 hash += (hash << 5) + (*str++);  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* AP Hash Function */  
unsigned int AP_hash(char *str)  
{  
         unsigned int hash = 0;  
         int i;  
         for (i=0; *str; i++)  
         {  
                 if ((i & 1) == 0)  
                 {  
                         hash ^= ((hash << 7) ^ (*str++) ^ (hash >> 3));  
                 }  
                 else  
                 {  
                         hash ^= (~((hash << 11) ^ (*str++) ^ (hash >> 5)));  
                 }  
         }  
 
         return (hash & 0x7FFFFFFF);  
}  
 
/* CRC Hash Function */  
unsigned int CRC_hash(char *str)  
{  
    unsigned int        nleft   = strlen(str);  
    unsigned long long  sum     = 0;  
    unsigned short int *w       = (unsigned short int *)str;  
    unsigned short int  answer  = 0;  
 
    /* 
     * Our algorithm is simple, using a 32 bit accumulator (sum), we add 
     * sequential 16 bit words to it, and at the end, fold back all the 
     * carry bits from the top 16 bits into the lower 16 bits. 
     */  
    while ( nleft > 1 ) {  
        sum += *w++;  
        nleft -= 2;  
    }  
    /* 
     * mop up an odd byte, if necessary 
     */  
    if ( 1 == nleft ) {  
        *( unsigned char * )( &answer ) = *( unsigned char * )w ;  
        sum += answer;  
    }  
    /* 
     * add back carry outs from top 16 bits to low 16 bits 
     * add hi 16 to low 16 
     */  
    sum = ( sum >> 16 ) + ( sum & 0xFFFF );  
    /* add carry */  
    sum += ( sum >> 16 );  
    /* truncate to 16 bits */  
    answer = ~sum;  
 
    return (answer & 0xFFFFFFFF);  
}

hashtest.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#include <stdio.h>  
#include <stdlib.h>  
#include <sys/types.h>  
#include <sys/stat.h>  
#include <fcntl.h>  
#include <errno.h>  
#include <string.h>  
#include "hash.h"  
#include "md5.h"  
 
struct hash_key {  
    unsigned char *key;  
    struct hash_key *next;   
};  
 
struct hash_counter_entry {  
    unsigned int hit_count;  
    unsigned int entry_count;  
    struct hash_key *keys;  
};  
 
#define BLOCK_LEN   4096  
 
static int backet_len = 10240;  
static int hash_call_count = 0;  
static struct hash_counter_entry *hlist = NULL;  
unsigned int (*hash_func)(char *str);  
 
void choose_hash_func(char *hash_func_name)  
{  
    if (0 == strcmp(hash_func_name, "simple_hash"))  
        hash_func = simple_hash;  
    else if (0 == strcmp(hash_func_name, "RS_hash"))  
        hash_func = RS_hash;  
    else if (0 == strcmp(hash_func_name, "JS_hash"))  
        hash_func = JS_hash;  
    else if (0 == strcmp(hash_func_name, "PJW_hash"))  
        hash_func = PJW_hash;  
    else if (0 == strcmp(hash_func_name, "ELF_hash"))  
        hash_func = ELF_hash;  
    else if (0 == strcmp(hash_func_name, "BKDR_hash"))  
        hash_func = BKDR_hash;  
    else if (0 == strcmp(hash_func_name, "SDBM_hash"))  
        hash_func = SDBM_hash;  
    else if (0 == strcmp(hash_func_name, "DJB_hash"))  
        hash_func = DJB_hash;  
    else if (0 == strcmp(hash_func_name, "AP_hash"))  
        hash_func = AP_hash;  
    else if (0 == strcmp(hash_func_name, "CRC_hash"))  
        hash_func = CRC_hash;  
    else  
        hash_func = NULL;  
}  
 
void insert_hash_entry(unsigned char *key, struct hash_counter_entry *hlist)  
{  
    unsigned int hash_value = hash_func(key) % backet_len;  
    struct hash_key *p;  
 
    p = hlist[hash_value].keys;  
    while(p) {  
        if (0 == strcmp(key, p->key))  
            break;  
        p = p->next;  
    }  
    if (p == NULL)  
    {  
        p = (struct hash_key *)malloc(sizeof(struct hash_key));  
        if (p == NULL)   
        {  
            perror("malloc in insert_hash_entry");  
            return;  
        }  
        p->key = strdup(key);  
        p->next = hlist[hash_value].keys;  
        hlist[hash_value].keys = p;  
        hlist[hash_value].entry_count++;  
    }  
    hlist[hash_value].hit_count++;  
}  
 
void hashtest_init()  
{  
    int i;  
 
    hash_call_count = 0;  
    hlist = (struct hash_counter_entry *) malloc (sizeof(struct hash_counter_entry) *  backet_len);  
    if (NULL == hlist)  
    {  
        perror("malloc in hashtest_init");  
        return;  
    }  
    for (i = 0; i < backet_len; i++)  
    {  
        hlist[i].hit_count = 0;  
        hlist[i].entry_count = 0;  
        hlist[i].keys = NULL;  
    }  
}  
 
void hashtest_clean()  
{  
    int i;  
    struct hash_key *pentry, *p;  
 
    if (NULL == hlist)  
        return;  
 
    for (i = 0; i < backet_len; ++i)  
    {  
        pentry = hlist[i].keys;  
        while(pentry)  
        {  
            p = pentry->next;  
            if (pentry->key) free(pentry->key);  
            free(pentry);  
            pentry = p;  
        }  
    }  
    free(hlist);  
}  
 
void show_hashtest_result()  
{  
    int i, backet = 0, max_link = 0, sum = 0;  
    int conflict_count = 0, hit_count = 0;  
    float avg_link, backet_usage;  
 
    for(i = 0; i < backet_len; i++)  
    {  
        if (hlist[i].hit_count > 0)   
        {  
            backet++;  
            sum += hlist[i].entry_count;  
            if (hlist[i].entry_count > max_link)  
            {  
                max_link = hlist[i].entry_count;  
            }  
            if (hlist[i].entry_count > 1)  
            {  
                conflict_count++;  
            }  
            hit_count += hlist[i].hit_count;  
        }  
    }  
 
    backet_usage = backet/1.0/backet_len * 100;;  
    avg_link = sum/1.0/backet;  
 
    printf("backet_len = %d/n", backet_len);  
    printf("hash_call_count = %d/n", hash_call_count);  
    printf("hit_count = %d/n", hit_count);  
    printf("conflict count = %d/n", conflict_count);  
    printf("longest hash entry = %d/n", max_link);  
    printf("average hash entry length = %.2f/n", avg_link);  
    printf("backet usage = %.2f%/n", backet_usage);  
}  
 
void usage()  
{  
    printf("Usage: hashtest filename hash_func_name [backet_len]/n");  
    printf("hash_func_name:/n");  
    printf("/tsimple_hash/n");  
    printf("/tRS_hash/n");  
    printf("/tJS_hash/n");  
    printf("/tPJW_hash/n");  
    printf("/tELF_hash/n");  
    printf("/tBKDR_hash/n");  
    printf("/tSDBM_hash/n");  
    printf("/tDJB_hash/n");  
    printf("/tAP_hash/n");  
    printf("/tCRC_hash/n");  
}  
 
void md5_to_32(unsigned char *md5_16, unsigned char *md5_32)  
{  
    int i;  
 
    for (i = 0; i < 16; ++i)  
    {  
        sprintf(md5_32 + i * 2, "%02x", md5_16[i]);  
    }  
}  
 
int main(int argc, char *argv[])  
{  
    int fd = -1, rwsize = 0;  
    unsigned char md5_checksum[16 + 1] = {0};  
    unsigned char buf[BLOCK_LEN] = {0};  
 
    if (argc < 3)   
    {  
        usage();  
        return -1;  
    }  
 
    if (-1 == (fd = open(argv[1], O_RDONLY)))  
    {  
        perror("open source file");  
        return errno;  
    }  
 
    if (argc == 4)  
    {  
        backet_len = atoi(argv[3]);  
    }  
 
    hashtest_init();  
    choose_hash_func(argv[2]);  
    while (rwsize = read(fd, buf, BLOCK_LEN))  
    {  
        md5(buf, rwsize, md5_checksum);  
        insert_hash_entry(md5_checksum, hlist);  
        hash_call_count++;  
        memset(buf, 0, BLOCK_LEN);  
        memset(md5_checksum, 0, 16 + 1);  
    }  
    close(fd);  
 
    show_hashtest_result();  
    hashtest_clean();  
    return 0;  
}

本文固定链接: http://www.chepoo.com/hash-c.html | IT技术精华网

【上一篇】
【下一篇】

字符串Hash函数–c语言实现版:等您坐沙发呢!

发表评论