OS-07 Deadlocks

记录操作系统中死锁相关内容。

System Model

Notations:

Threads T1, T2, …, Tn
Resource types R1, R2, …, Rm
- e.g., CPU, memory space, I/O devices, mutex and semaphores
Each resource type Ri has Wi instances.

定义：当一组线程中的每个线程都在等待仅由该组中其他线程引起的事件时，这组线程就处于死锁状态。这些事件主要包括资源的获取和释放。

eg. 双向车辆在一车道会车

Deadlock Characterization

同时满足以下四个条件导致死锁：

Mutual Exclusion: 互斥，该时刻资源被一个线程独占
Hold and Wait: 一个线程在等待另一个线程释放资源
No Preemption: 一个线程不能抢占另一个线程的资源
Circular Wait: 一组线程形成一个环，每个线程都在等待另一个线程释放资源

Resource-Allocation Graph

Resource-Allocation Graph (RAG) 是一个有向图，用于表示资源分配和请求的关系。

节点表示进程和资源
边表示进程对资源的请求和释放

Methods for Handling Deadlocks

Never enters a deadlock
- 预防死锁 prevention
- 避免死锁 avoidance
Detect and recover from deadlock
- 检测死锁 detection
- 从死锁中恢复 recovery

Deadlock Prevention

破坏四个条件之一:

互斥：去除共享资源的互斥性
占有并等待：一次性申请所有资源/仅在无任何资源才请求
不可抢占：允许抢占资源
循环等待：对资源进行排序

Deadlock Avoidance

需要额外信息支持：

每个进程对每种资源的最大需求
每种资源的可用数量

safe state：一个序列是安全的，如果对于每个进程，它可以在有限时间内完成，并且所需的资源可以被当前可用的资源和以前分配的资源满足

分配资源策略：当存在一个安全序列时，分配资源

resource-allocation-graph algorithm

相关概念：引入了新的概念——声称边（Claim edge），用有向虚线Ti → Rj表示线程Ti可能请求资源Rj。当线程请求资源时，声称边转换为请求边；资源分配给线程时，请求边转换为分配边；线程释放资源时，分配边重新转换为声称边（若线程结束，该边移除）。
算法核心判断：当线程Ti请求资源Rj时，只有在将请求边转换为分配边后，资源分配图中不会形成圈的情况下，该请求才能被批准。若形成圈，则意味着可能出现死锁，请求不能被批准，线程需要等待。比如，若线程T1请求资源R1，在当前资源分配图的基础上，如果将T1到R1的请求边转换为分配边后，图中出现了T1 → R1 → T2 → R2 → T1这样的圈，那么T1对R1的请求就不能被批准，T1需等待

Banker’s Algorithm

银行家算法(Banker’s Algorithm)是一种著名的死锁避免算法。其名字的由来是因为该算法原本是银行用来分配资源的策略。

基本概念

数据结构
- Available: 长度为m的向量，表示每种资源的可用数量
- Max: n×m矩阵，表示每个进程对各种资源的最大需求
- Allocation: n×m矩阵，表示每个进程已分配的各种资源数量
- Need: n×m矩阵，表示每个进程还需要的各种资源数量
  其中，n为进程数，m为资源类型数
安全状态
系统处于安全状态意味着存在一个安全序列，使得所有进程能够按照这个序列顺序完成。

算法流程

#include <stdio.h>
#include <stdbool.h>

// 定义线程数量和资源类型数量
#define N 5
#define M 3

// 安全状态检查函数
bool isSafe(int available[], int max[][M], int allocation[][M], int need[][M], int *safeSequence) {
    int work[M];
    bool finish[N] = {false};
    int count = 0;

    // 初始化work向量
    for (int i = 0; i < M; i++) {
        work[i] = available[i];
    }

    // 寻找安全序列
    while (count < N) {
        bool found = false;
        for (int i = 0; i < N; i++) {
            if (!finish[i]) {
                int j;
                for (j = 0; j < M; j++) {
                    if (need[i][j] > work[j]) {
                        break;
                    }
                }
                if (j == M) {
                    // 可以满足线程i的需求
                    for (int k = 0; k < M; k++) {
                        work[k] += allocation[i][k];
                    }
                    safeSequence[count++] = i;
                    finish[i] = true;
                    found = true;
                }
            }
        }
        if (!found) {
            break;
        }
    }

    return count == N;
}

// 资源请求处理函数
bool requestResources(int available[], int max[][M], int allocation[][M], int need[][M], int request[], int threadId) {
    // 检查请求是否超过最大需求
    for (int i = 0; i < M; i++) {
        if (request[i] > need[threadId][i]) {
            printf("线程 %d 请求的资源超过最大需求，错误！\n", threadId);
            return false;
        }
    }

    // 检查请求是否超过可用资源
    for (int i = 0; i < M; i++) {
        if (request[i] > available[i]) {
            printf("线程 %d 请求的资源超过可用资源，等待！\n", threadId);
            return false;
        }
    }

    // 尝试分配资源
    for (int i = 0; i < M; i++) {
        available[i] -= request[i];
        allocation[threadId][i] += request[i];
        need[threadId][i] -= request[i];
    }

    int safeSequence[N];
    if (isSafe(available, max, allocation, need, safeSequence)) {
        printf("线程 %d 的资源请求已批准！\n", threadId);
        return true;
    } else {
        // 恢复资源分配状态
        for (int i = 0; i < M; i++) {
            available[i] += request[i];
            allocation[threadId][i] -= request[i];
            need[threadId][i] += request[i];
        }
        printf("线程 %d 的资源请求会导致死锁，等待！\n", threadId);
        return false;
    }
}

int main() {
    // 初始化资源信息
    int available[M] = {3, 3, 2};
    int max[N][M] = {
        {7, 5, 3},
        {3, 2, 2},
        {9, 0, 2},
        {2, 2, 2},
        {4, 3, 3}
    };
    int allocation[N][M] = {
        {0, 1, 0},
        {2, 0, 0},
        {3, 0, 2},
        {2, 1, 1},
        {0, 0, 2}
    };
    int need[N][M];

    // 计算Need矩阵
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            need[i][j] = max[i][j] - allocation[i][j];
        }
    }

    // 检查初始状态是否安全
    int safeSequence[N];
    if (isSafe(available, max, allocation, need, safeSequence)) {
        printf("系统处于安全状态，安全序列: ");
        for (int i = 0; i < N; i++) {
            printf("T%d ", safeSequence[i]);
        }
        printf("\n");
    } else {
        printf("系统处于不安全状态！\n");
    }

    // 处理线程请求
    int request[M] = {1, 0, 2};
    int threadId = 1;
    requestResources(available, max, allocation, need, request, threadId);

    return 0;
}

Deadlock Detection

Single instance of each resource type: 维护一个等待图

Multiple instances of each resource type:

available vector
allocation matrix
request matrix

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define MAX_PROCESSES 100
#define MAX_RESOURCES 100

/**
 * @brief 死锁检测算法的数据结构
 */
typedef struct {
    int available[MAX_RESOURCES];     // 可用资源向量
    int allocation[MAX_PROCESSES][MAX_RESOURCES];    // 已分配矩阵
    int request[MAX_PROCESSES][MAX_RESOURCES];       // 请求矩阵
    int num_processes;    // 进程数量
    int num_resources;    // 资源类型数量
} SystemState;

/**
 * @brief 检查进程是否可以完成
 */
bool can_process_finish(int process_id, bool* finished, int* work, SystemState* state) {
    // 检查是否所有请求都可以被满足
    for (int j = 0; j < state->num_resources; j++) {
        if (state->request[process_id][j] > work[j]) {
            return false;
        }
    }
    return true;
}

/**
 * @brief 更新工作向量
 */
void update_work_vector(int process_id, int* work, SystemState* state) {
    for (int j = 0; j < state->num_resources; j++) {
        work[j] += state->allocation[process_id][j];
    }
}

/**
 * @brief 死锁检测算法主函数
 * @return 返回死锁进程数量，0表示无死锁
 */
int detect_deadlock(SystemState* state, int* deadlocked_processes) {
    bool* finished = (bool*)calloc(state->num_processes, sizeof(bool));
    int* work = (int*)malloc(state->num_resources * sizeof(int));
    int deadlock_count = 0;
    
    // 初始化工作向量
    for (int i = 0; i < state->num_resources; i++) {
        work[i] = state->available[i];
    }

    // 标记没有请求的进程为已完成
    for (int i = 0; i < state->num_processes; i++) {
        bool has_request = false;
        for (int j = 0; j < state->num_resources; j++) {
            if (state->request[i][j] > 0) {
                has_request = true;
                break;
            }
        }
        if (!has_request) {
            finished[i] = true;
        }
    }

    // 死锁检测主循环
    bool changed;
    do {
        changed = false;
        for (int i = 0; i < state->num_processes; i++) {
            if (!finished[i] && can_process_finish(i, finished, work, state)) {
                finished[i] = true;
                update_work_vector(i, work, state);
                changed = true;
            }
        }
    } while (changed);

    // 统计死锁进程
    for (int i = 0; i < state->num_processes; i++) {
        if (!finished[i]) {
            deadlocked_processes[deadlock_count++] = i;
        }
    }

    free(finished);
    free(work);
    return deadlock_count;
}

/**
 * @brief 打印死锁检测结果
 */
void print_deadlock_result(int* deadlocked_processes, int count) {
    if (count == 0) {
        printf("系统中没有死锁\n");
    } else {
        printf("检测到死锁！以下进程处于死锁状态：\n");
        for (int i = 0; i < count; i++) {
            printf("进程 P%d\n", deadlocked_processes[i]);
        }
    }
}

/**
 * @brief 示例使用
 */
int main() {
    SystemState state = {
        .num_processes = 3,
        .num_resources = 3,
        .available = {0, 0, 0},  // 当前可用资源
        .allocation = {          // 已分配资源
            {3, 3, 3},
            {2, 0, 3},
            {1, 2, 4}
        },
        .request = {            // 请求资源
            {0, 1, 0},
            {2, 0, 0},
            {0, 0, 2}
        }
    };

    int deadlocked_processes[MAX_PROCESSES];
    int deadlock_count = detect_deadlock(&state, deadlocked_processes);
    
    print_deadlock_result(deadlocked_processes, deadlock_count);
    
    return 0;
}

核心算法：通过work向量和finish向量来判断是否存在死锁，循环迭代查找可完成进程

使用检测算法，要考虑死锁发生频率，回滚线程数量等开销

Recovery from Deadlock

终止进程：终止一个/多个进程，直到死锁解除
抢占资源：抢占一个/多个进程的资源，直到死锁解除