08. 数据结构：哈希表

# 08. 数据结构：哈希表

本章要把上一章服务器里那些占位的代码给补上。咱们先从实现一个哈希表（Hashtables）开始。哈希表这东西，用来存放数量未知且无需排序的键值对数据，那可太合适了。

哈希表有两种：链地址法（chaining）和开放地址法（open addressing）。它们的主要区别就在于处理冲突的方式。开放地址法在遇到冲突时，会去找下一个空闲的槽位；而链地址法呢，就是用链表把冲突的键给串起来。由于找空闲槽位的方式不同，开放地址法有好多变种，相比之下，链地址法的设计就比较固定。咱们服务器里用的就是链地址法的哈希表，这玩意儿写起代码来简单，不用费太多心思做选择。

先看看我们定义的数据类型：

// 哈希表节点，应该嵌入到有效载荷中
struct HNode {
    HNode *next = NULL;
    uint64_t hcode = 0;
};

// 一个简单的固定大小的哈希表
struct HTab {
    HNode **tab = NULL;
    size_t mask = 0;
    size_t size = 0;
};

1
2
3
4
5
6
7
8
9
10
11
12

当哈希表的大小是2的幂次方时，索引操作就只是用哈希码做个简单的按位掩码运算。

// n必须是2的幂次方
static void h_init(HTab *htab, size_t n) {
    assert(n > 0 && ((n - 1) & n) == 0);
    htab->tab = (HNode **)calloc(sizeof(HNode *), n);
    htab->mask = n - 1;
    htab->size = 0;
}

// 哈希表插入操作
static void h_insert(HTab *htab, HNode *node) {
    size_t pos = node->hcode & htab->mask;
    HNode *next = htab->tab[pos];
    node->next = next;
    htab->tab[pos] = node;
    htab->size++;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

查找子程序其实就是遍历链表：

// 哈希表查找子程序。
// 注意返回值。它返回的是拥有目标节点的父指针的地址，
// 这个地址可用来删除目标节点。
static HNode **h_lookup(
    HTab *htab, HNode * key, bool (*cmp)(HNode *, HNode *))
{
    if (!htab->tab) {
        return NULL;
    }

    size_t pos = key->hcode & htab->mask;
    HNode **from = &htab->tab[pos];
    while (*from) {
        if (cmp(*from, key)) {
            return from;
        }
        from = &(*from)->next;
    }
    return NULL;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

删除操作也很简单。注意看，指针的使用让代码简洁了不少。from指针既可以是数组中的一项，也可以来自某个节点，但代码里不用区分。

// 从链表中移除一个节点
static HNode *h_detach(HTab *htab, HNode **from) {
    HNode *node = *from;
    *from = (*from)->next;
    htab->size--;
    return node;
}

1
2
3
4
5
6
7

我们的哈希表大小是固定的，要是负载因子（load factor）太高，就得换个大的。在Redis里用哈希表时，还有个额外的考量。给大哈希表调整大小，得把好多节点挪到新表里去，这可能会让服务器卡顿一会儿。为了避免这种情况，我们不一下子挪动所有节点，而是用两个哈希表，慢慢地把节点在它们之间转移。下面就是最终的哈希表接口：

// 真正的哈希表接口。
// 它用两个哈希表来逐步调整大小。
struct HMap {
    HTab ht1;
    HTab ht2;
    size_t resizing_pos = 0;
};

1
2
3
4
5
6
7

现在查找子程序还得帮忙调整大小：

HNode *hm_lookup(
    HMap *hmap, HNode * key, bool (*cmp)(HNode *, HNode *))
{
    hm_help_resizing(hmap);
    HNode **from = h_lookup(&hmap->ht1, key, cmp);
    if (!from) {
        from = h_lookup(&hmap->ht2, key, cmp);
    }
    return from? *from : NULL;
}

1
2
3
4
5
6
7
8
9
10

hm_help_resizing函数就是用来慢慢挪动节点的子程序：

const size_t k_resizing_work = 128;

static void hm_help_resizing(HMap *hmap) {
    if (hmap->ht2.tab == NULL) {
        return;
    }

    size_t nwork = 0;
    while (nwork < k_resizing_work && hmap->ht2.size > 0) {
        // 从ht2里扫描节点并把它们挪到ht1
        HNode **from = &hmap->ht2.tab[hmap->resizing_pos];
        if (!*from) {
            hmap->resizing_pos++;
            continue;
        }

        h_insert(&hmap->ht1, h_detach(&hmap->ht2, from));
        nwork++;
    }

    if (hmap->ht2.size == 0) {
        // 完成
        free(hmap->ht2.tab);
        hmap->ht2 = HTab{};
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

插入子程序在表太满的时候会触发调整大小操作：

const size_t k_max_load_factor = 8;

void hm_insert(HMap *hmap, HNode *node) {
    if (!hmap->ht1.tab) {
        h_init(&hmap->ht1, 4);
    }

    h_insert(&hmap->ht1, node);

    if (!hmap->ht2.tab) {
        // 检查是否需要调整大小
        size_t load_factor = hmap->ht1.size / (hmap->ht1.mask + 1);
        if (load_factor >= k_max_load_factor) {
            hm_start_resizing(hmap);
        }
    }
    hm_help_resizing(hmap);
}

static void hm_start_resizing(HMap *hmap) {
    assert(hmap->ht2.tab == NULL);
    // 创建一个更大的哈希表并交换它们
    hmap->ht2 = hmap->ht1;
    h_init(&hmap->ht1, (hmap->ht1.mask + 1) * 2);
    hmap->resizing_pos = 0;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

删除键的子程序，没啥特别的。

HNode *hm_pop(
    HMap *hmap, HNode * key, bool (*cmp)(HNode *, HNode *))
{
    hm_help_resizing(hmap);
    HNode **from = h_lookup(&hmap->ht1, key, cmp);
    if (from) {
        return h_detach(&hmap->ht1, from);
    }
    from = h_lookup(&hmap->ht2, key, cmp);
    if (from) {
        return h_detach(&hmap->ht2, from);
    }
    return NULL;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14

哈希表实现完了，把它们加到服务器里吧。再看看HNode结构体，这玩意儿里面没数据，那咋用呢？答案就是“侵入式数据结构（intrusive data structure）”：

// 键的结构
struct Entry {
    struct HNode node;
    std::string key;
    std::string val;
};

1
2
3
4
5
6

我们没让数据结构包含数据，而是把哈希表节点结构嵌入到有效载荷数据里。这在C语言里可是创建通用数据结构的标准操作。这种技术不仅能让数据结构完全通用，还能减少不必要的内存管理。节点结构不是单独分配的，而是有效载荷数据的一部分，数据结构代码并不拥有有效载荷，只是负责组织数据。要是你从教科书上学数据结构，可能接触的是void *、C++模板，甚至宏，那这种方式对你来说可能还挺新鲜。

列出do_get函数，看看侵入式数据结构是咋用的：

// 键空间的数据结构。
static struct {
    HMap db;
} g_data;

static uint32_t do_get(
    std::vector<std::string>  &cmd, uint8_t * res, uint32_t * reslen)
{
    Entry key;
    key.key.swap(cmd[1]);
    key.node.hcode = str_hash((uint8_t *)key.key.data(), key.key.size());

    HNode *node = hm_lookup(&g_data.db, &key.node, &entry_eq);
    if (!node) {
        return RES_NX;
    }

    const std::string &val = container_of(node, Entry, node)->val;
    assert(val.size() <= k_max_msg);
    memcpy(res, val.data(), val.size());
    *reslen = (uint32_t)val.size();
    return RES_OK;
}

static bool entry_eq(HNode *lhs, HNode * rhs) {
    struct Entry *le = container_of(lhs, struct Entry, node);
    struct Entry * re = container_of(rhs, struct Entry, node);
    return lhs->hcode == rhs->hcode && le->key == re->key;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

hm_lookup函数返回一个指向HNode的指针，而HNode是Entry的一个成员，我们得用点指针运算把这个指针转成Entry指针。在C项目里，container_of宏就是干这个用的：

#define container_of(ptr, type, member)  ({                                         \
    const typeof(  ((type *)0)->member )  *     mptr =  (ptr);        \
    (type *)(  (char *)      mptr -  offsetof(type, member)  );})

1
2
3

do_set和do_del函数都很简单。

static uint32_t do_set(
    std::vector<std::string>  &cmd, uint8_t * res, uint32_t * reslen)
{
    (void)res;
    (void)reslen;

    Entry key;
    key.key.swap(cmd[1]);
    key.node.hcode = str_hash((uint8_t *)key.key.data(), key.key.size());

    HNode *node = hm_lookup(&g_data.db, &key.node, &entry_eq);
    if (node) {
        container_of(node, Entry, node)->val.swap(cmd[2]);
    } else {
        Entry *ent = new Entry();
        ent->key.swap(key.key);
        ent->node.hcode = key.node.hcode;
        ent->val.swap(cmd[2]);
        hm_insert(&g_data.db, &ent->node);
    }
    return RES_OK;
}

static uint32_t do_del(
    std::vector<std::string>  &cmd, uint8_t * res, uint32_t * reslen)
{
    (void)res;
    (void)reslen;

    Entry key;
    key.key.swap(cmd[1]);
    key.node.hcode = str_hash((uint8_t *)key.key.data(), key.key.size());

    HNode *node = hm_pop(&g_data.db, &key.node, &entry_eq);
    if (node) {
        delete container_of(node, Entry, node);
    }
    return RES_OK;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

# 练习

我们的哈希表在负载因子太高时会触发调整大小操作，那负载因子太低的时候，是不是也该缩小哈希表呢？能自动进行缩小操作吗？

08_server.cpp
hashtable.cpp
hashtable.h

上次更新: 2025/03/25, 00:48:42

← 07. 基础服务器：实现get、set、del功能 09. 数据序列化→