進程管理

1 進程描述符及任務結構
- 分配進程描述符
- 進程描述符的存放
- 進程狀態
- 設置當前進程狀態
- 進程上下文
- 進程家族樹
2 進程創建
- 寫時拷貝
- fork()
- vfork()
3 線程在Linux中的實現
- 內核線程
4 進程終結
- 刪除進程描述符
- 孤兒進程造成的進退維谷
5 小結

進程的另一個名字叫做任務（task）。Linux內核通常把進程也叫做任務。下面我們會交替使用任務和進程兩個術語。

1 進程描述符及任務結構

內核把進程存放在叫作任務隊列（task list）的雙向循環鏈表中。鏈表中的每一項都是類型為task_struct、稱為進程描述符的結構，該結構定義在include/linux/sched.h文件中。進程描述符包含一個具體進程的所有信息。

struct task_struct {volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */struct thread_info *thread_info;atomic_t usage;unsigned long flags;	/* per process flags, defined below */unsigned long ptrace;int lock_depth;		/* Lock depth */int prio, static_prio;struct list_head run_list;prio_array_t *array;unsigned long sleep_avg;long interactive_credit;unsigned long long timestamp, last_ran;int activated;unsigned long policy;cpumask_t cpus_allowed;unsigned int time_slice, first_time_slice;#ifdef CONFIG_SCHEDSTATSstruct sched_info sched_info;
#endifstruct list_head tasks;/** ptrace_list/ptrace_children forms the list of my children* that were stolen by a ptracer.*/struct list_head ptrace_children;struct list_head ptrace_list;struct mm_struct *mm, *active_mm;/* task state */struct linux_binfmt *binfmt;long exit_state;int exit_code, exit_signal;int pdeath_signal;  /*  The signal sent when the parent dies  *//* ??? */unsigned long personality;unsigned did_exec:1;pid_t pid;pid_t tgid;/* * pointers to (original) parent process, youngest child, younger sibling,* older sibling, respectively.  (p->father can be replaced with * p->parent->pid)*/struct task_struct *real_parent; /* real parent process (when being debugged) */struct task_struct *parent;	/* parent process *//** children/sibling forms the list of my children plus the* tasks I'm ptracing.*/struct list_head children;	/* list of my children */struct list_head sibling;	/* linkage in my parent's children list */struct task_struct *group_leader;	/* threadgroup leader *//* PID/PID hash table linkage. */struct pid pids[PIDTYPE_MAX];wait_queue_head_t wait_chldexit;	/* for wait4() */struct completion *vfork_done;		/* for vfork() */int __user *set_child_tid;		/* CLONE_CHILD_SETTID */int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */unsigned long rt_priority;unsigned long it_real_value, it_prof_value, it_virt_value;unsigned long it_real_incr, it_prof_incr, it_virt_incr;struct timer_list real_timer;unsigned long utime, stime;unsigned long nvcsw, nivcsw; /* context switch counts */struct timespec start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */unsigned long min_flt, maj_flt;
/* process credentials */uid_t uid,euid,suid,fsuid;gid_t gid,egid,sgid,fsgid;struct group_info *group_info;kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;unsigned keep_capabilities:1;struct user_struct *user;
#ifdef CONFIG_KEYSstruct key *session_keyring;	/* keyring inherited over fork */struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */struct key *thread_keyring;	/* keyring private to this thread */
#endifunsigned short used_math;char comm[16];
/* file system info */int link_count, total_link_count;
/* ipc stuff */struct sysv_sem sysvsem;
/* CPU-specific state of this task */struct thread_struct thread;
/* filesystem information */struct fs_struct *fs;
/* open file information */struct files_struct *files;
/* namespace */struct namespace *namespace;
/* signal handlers */struct signal_struct *signal;struct sighand_struct *sighand;sigset_t blocked, real_blocked;struct sigpending pending;unsigned long sas_ss_sp;size_t sas_ss_size;int (*notifier)(void *priv);void *notifier_data;sigset_t *notifier_mask;void *security;struct audit_context *audit_context;/* Thread group tracking */u32 parent_exec_id;u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */spinlock_t alloc_lock;
/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */spinlock_t proc_lock;
/* context-switch lock */spinlock_t switch_lock;/* journalling filesystem info */void *journal_info;/* VM state */struct reclaim_state *reclaim_state;struct dentry *proc_dentry;struct backing_dev_info *backing_dev_info;struct io_context *io_context;unsigned long ptrace_message;siginfo_t *last_siginfo; /* For ptrace use.  */
/** current io wait handle: wait queue entry to use for io waits* If this thread is processing aio, this points at the waitqueue* inside the currently handled kiocb. It may be NULL (i.e. default* to a stack based synchronous wait) if its doing sync IO.*/wait_queue_t *io_wait;
#ifdef CONFIG_NUMAstruct mempolicy *mempolicy;short il_next;		/* could be shared with used_math */
#endif
};

分配進程描述符

Linux通過slab分配器分配task_struct結構，這樣能達到對象復用和緩存著色的目的（通過預先分配和重復使用task_struct，可以避免動態分配和釋放所帶來的資源消耗）。在2.6以前的內核中，各個進程的task_struct存放在它們內核棧的尾端。這樣做是為了讓那些像x86這樣寄存器較少的硬件體系結構只要通過棧指針就能計算出它的位置。由于現在用slab分配器動態生成task_struct，所以只需在棧底（對于向下增長的棧來說）或棧頂（對于向上增長的棧來說）創建一個新的結構struct thread_info，thread_info有一個指向進程描述符的指針。
在這里插入圖片描述

在x86_64上，thread_info結構在文件include/asm-x86_64/thread_info.h中

/*
 * Per-task low-level bookkeeping structure.  Its task member points back
 * to the task's process descriptor (struct task_struct).
 */
struct thread_info {
	struct task_struct	*task;		/* main task structure */
	struct exec_domain	*exec_domain;	/* execution domain */
	__u32			flags;		/* low level flags */
	__u32			status;		/* thread synchronous flags */
	__u32			cpu;		/* current CPU */
	int			preempt_count;

	mm_segment_t		addr_limit;
	struct restart_block	restart_block;
};

每個任務的thread_info結構在它的內核棧的尾部分配。結構中task域存放的是指向該任務實際task_struct的指針。每個進程都有一個thread_info結構，指向自己的task_struct進程描述符。

進程描述符的存放

內核通過一個唯一的進程標識值或PID來標識每個進程。PID是pid_t類型，是一個int類型。為了與老版本的Unix和Linux兼容，PID的最大值默認設置為32768，內核把每個進程的PID存放在它們各自的進程描述符中。
這個最大值很重要，因為它實際上就是系統中允許同時存在的進程的最大數目。如果確實需要的話，可以由系統管理員通過修改/proc/sys/kernel/pid_max來提高上限。
在內核中，訪問任務通常需要獲得指向其task_struct指針。可以通過current宏查找到當前正在運行進程的進程描述符。有的硬件體系結構可以拿出一個專門寄存器來存放當前進程task_struct的地址。
在x86這類寄存器較少的體系結構上，current是通過棧指針計算出來的。下面以m32r體系結構為例，current宏定義在include/asm-m32r/current.h文件中。current_thread_info()把棧指針的低13位屏蔽掉，用來計算出thread_info的地址，因為thread_info結構存放在它的內核棧的尾端；current再通過thread_info的task域得到當前進程的task_struct。

static __inline__ struct task_struct *get_current(void)
{return current_thread_info()->task;
}#define current	(get_current())static inline struct thread_info *current_thread_info(void)
{struct thread_info *ti;__asm__ __volatile__ ("ldi	%0, #0xffffe000;	\n\t""and	%0, sp;			\n\t": "=r" (ti));return ti;
}

這里內核棧的大小是8KB（兩頁），而2^13 = 8KB，所以13位剛好可以表示8KB內的偏移；把棧指針的低13位屏蔽掉，剛剛好指向棧的尾端，也就是thread_info所在的位置。

進程狀態

進程描述符中的state域描述了進程的當前狀態

volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */

系統中的每個進程都必然處于五種進程狀態中的一種。該域的值也必然為下列五種狀態標志之一：

TASK_RUNNING（運行）：進程是可執行的；它或者正在執行，或者在運行隊列中等待執行
TASK_INTERRUPTIBLE（可中斷）：進程正在睡眠（也就是被阻塞），等待某些條件的達成。一旦這些條件達成，內核就會把進程狀態設置為運行。
TASK_UNINTERRUPTIBLE（不可中斷）：除了不會因為接收到信號而被喚醒從而投入運行外，這個狀態與可中斷狀態相同。
TASK_ZOMBIE（僵死）：該進程已經結束了，但是其父進程還沒有調用wait4()系統調用，釋放相關資源。
TASK_STOPPED（停止）：進程停止運行。

請添加圖片描述

設置當前進程狀態

內核經常需要調整某個進程的狀態。這時最好使用set_task_state函數：

set_task_state(task,state)//將任務task的狀態設置為state
/*
 * Set the state field of task tsk to state_value through set_mb()
 * (defined elsewhere; presumably a store plus memory barrier - confirm).
 */
#define set_task_state(tsk, state_value)		\
	set_mb((tsk)->state, (state_value))

該函數將指定的進程設置為指定的狀態。位置在/include/linux/sched.h文件中。
方法set_current_state(state)和set_task_state(current, state)含義是相同的。

進程上下文

一般程序在用戶空間執行。當一個程序執行了系統調用或者觸發了某個異常，它就陷入了內核空間。此時，我們稱內核"代表進程執行"并處于進程上下文中。在此上下文中的current宏是有效的。除非在此間隙有更高優先級的進程需要執行并由調度器作出了相應調整，否則在內核退出的時候，程序恢復在用戶空間繼續執行。

進程家族樹

Unix系統的進程之間存在明顯的繼承關系，在Linux系統中也是如此。所有的進程都是PID為1的init進程的后代。系統中的每個進程必有一個父進程。相應的，每個進程也可以擁有零個或多個子進程。進程間的關系存放在進程描述符中。每個task_struct都包含一個指向其父進程進程描述符的parent指針，還包含一個稱為children的子進程鏈表。所以對于當前進程，可以通過下面的代碼獲得其父進程的進程描述符：

struct task_struct *my_parent = current->parent;	//current是指向當前進程的進程描述符指針

同樣，也可以按以下方式依次訪問子進程

struct task_struct *task;
struct list_head *list;

/* Walk the children list of the current task, one child per iteration. */
list_for_each(list, &current->children) {
	/* Each list node is the sibling member embedded in a child's task_struct. */
	task = list_entry(list, struct task_struct, sibling);
	/* task now points to one of current's children */
}

2 進程創建

Unix的進程創建很特別。許多其他的操作系統都提供了產生進程的機制，首先在新的地址空間里創建進程，讀入可執行文件，最后開始執行。Unix把上述步驟分解到兩個單獨的函數中去執行：fork()和exec()。首先，fork()通過拷貝當前進程創建一個子進程。然后exec()函數負責讀取可執行文件并將其載入地址空間開始執行。

寫時拷貝

Linux的fork()使用寫時拷貝(copy-on-write)頁實現。寫時拷貝是一種可以推遲甚至免除拷貝數據的技術。當調用fork時，內核此時并不復制整個進程地址空間，而是讓父進程和子進程共享同一個拷貝。只有在需要寫入的時候，數據才會被復制，從而使各個進程擁有各自的拷貝。也就是說，資源的復制只有在需要寫入的時候才進行，在此之前，只是以只讀方式共享。

fork()

Linux通過clone()系統調用實現 fork()。這個調用通過一系列的參數標志來指明父子進程需要共享的資源。fork()、vfork()和__clone()庫函數都根據各自需要的參數標志去調用clone()。然后由clone()去調用do_fork()。
do_fork()完成了創建中的大部分工作，它的定義在kernel/fork.c文件中。該函數調用copy_process函數，然后讓進程開始運行。copy_process函數完成的工作很有意思：

/*
 * Create a new task as a copy of the current one, but do not start it.
 *
 * clone_flags:    CLONE_* flags selecting what is shared with the parent.
 * stack_start, regs, stack_size: passed through to copy_thread().
 * parent_tidptr:  if CLONE_PARENT_SETTID is set, the new pid is written
 *                 here with put_user().
 * child_tidptr:   stored in set_child_tid / clear_child_tid when
 *                 CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID are set.
 * pid:            pid assigned to the new task.
 *
 * Returns the new task_struct on success, or ERR_PTR(-errno) on failure.
 * On failure, everything acquired so far is released by falling through
 * the bad_fork_* labels in reverse order of acquisition.
 */
static task_t *copy_process(unsigned long clone_flags,
			    unsigned long stack_start,
			    struct pt_regs *regs,
			    unsigned long stack_size,
			    int __user *parent_tidptr,
			    int __user *child_tidptr,
			    int pid)
{
	int retval;
	struct task_struct *p = NULL;

	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	retval = -EAGAIN;
	if (atomic_read(&p->user->processes) >=
			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
				p->user != &root_user)
			goto bad_fork_free;
	}

	atomic_inc(&p->user->__count);
	atomic_inc(&p->user->processes);
	get_group_info(p->group_info);

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(p->thread_info->exec_domain->module))
		goto bad_fork_cleanup_count;

	if (p->binfmt && !try_module_get(p->binfmt->module))
		goto bad_fork_cleanup_put_domain;

	p->did_exec = 0;
	copy_flags(clone_flags, p);
	p->pid = pid;
	retval = -EFAULT;
	if (clone_flags & CLONE_PARENT_SETTID)
		if (put_user(p->pid, parent_tidptr))
			goto bad_fork_cleanup;

	p->proc_dentry = NULL;

	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	init_waitqueue_head(&p->wait_chldexit);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);
	spin_lock_init(&p->proc_lock);

	clear_tsk_thread_flag(p, TIF_SIGPENDING);
	init_sigpending(&p->pending);

	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->utime = p->stime = 0;
	p->lock_depth = -1;		/* -1 = no lock */
	do_posix_clock_monotonic_gettime(&p->start_time);
	p->security = NULL;
	p->io_context = NULL;
	p->io_wait = NULL;
	p->audit_context = NULL;
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_copy(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup;
	}
#endif

	p->tgid = p->pid;
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	if ((retval = security_task_alloc(p)))
		goto bad_fork_cleanup_policy;
	if ((retval = audit_alloc(p)))
		goto bad_fork_cleanup_security;
	/* copy all the process information */
	if ((retval = copy_semundo(clone_flags, p)))
		goto bad_fork_cleanup_audit;
	if ((retval = copy_files(clone_flags, p)))
		goto bad_fork_cleanup_semundo;
	if ((retval = copy_fs(clone_flags, p)))
		goto bad_fork_cleanup_files;
	if ((retval = copy_sighand(clone_flags, p)))
		goto bad_fork_cleanup_fs;
	if ((retval = copy_signal(clone_flags, p)))
		goto bad_fork_cleanup_sighand;
	if ((retval = copy_mm(clone_flags, p)))
		goto bad_fork_cleanup_signal;
	if ((retval = copy_keys(clone_flags, p)))
		goto bad_fork_cleanup_mm;
	if ((retval = copy_namespace(clone_flags, p)))
		goto bad_fork_cleanup_keys;
	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_namespace;

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;

	/*
	 * Syscall tracing should be turned off in the child regardless
	 * of CLONE_PTRACE.
	 */
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);

	/*
	 * Our parent execution domain becomes current domain
	 * These must match for thread signalling to apply
	 */
	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
	p->pdeath_signal = 0;
	p->exit_state = 0;

	/* Perform scheduler related setup */
	sched_fork(p);

	/*
	 * Ok, make it visible to the rest of the system.
	 * We dont wake it up yet.
	 */
	p->group_leader = p;
	INIT_LIST_HEAD(&p->ptrace_children);
	INIT_LIST_HEAD(&p->ptrace_list);

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);

	/*
	 * The task hasn't been attached yet, so cpus_allowed mask cannot
	 * have changed. The cpus_allowed mask of the parent may have
	 * changed after it was copied first time, and it may then move to
	 * another CPU - so we re-copy it here and set the child's CPU to
	 * the parent's CPU. This avoids alot of nasty races.
	 */
	p->cpus_allowed = current->cpus_allowed;
	set_task_cpu(p, smp_processor_id());

	/*
	 * Check for pending SIGKILL! The new thread should not be allowed
	 * to slip out of an OOM kill. (or normal SIGKILL.)
	 */
	if (sigismember(&current->pending.signal, SIGKILL)) {
		write_unlock_irq(&tasklist_lock);
		retval = -EINTR;
		goto bad_fork_cleanup_namespace;
	}

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
		p->real_parent = current->real_parent;
	else
		p->real_parent = current;
	p->parent = p->real_parent;

	if (clone_flags & CLONE_THREAD) {
		spin_lock(&current->sighand->siglock);
		/*
		 * Important: if an exit-all has been started then
		 * do not create this new thread - the whole thread
		 * group is supposed to exit anyway.
		 */
		if (current->signal->group_exit) {
			spin_unlock(&current->sighand->siglock);
			write_unlock_irq(&tasklist_lock);
			retval = -EAGAIN;
			goto bad_fork_cleanup_namespace;
		}
		p->group_leader = current->group_leader;

		if (current->signal->group_stop_count > 0) {
			/*
			 * There is an all-stop in progress for the group.
			 * We ourselves will stop as soon as we check signals.
			 * Make the new thread part of that group stop too.
			 */
			current->signal->group_stop_count++;
			set_tsk_thread_flag(p, TIF_SIGPENDING);
		}

		spin_unlock(&current->sighand->siglock);
	}

	SET_LINKS(p);
	if (unlikely(p->ptrace & PT_PTRACED))
		__ptrace_link(p, current->parent);

	attach_pid(p, PIDTYPE_PID, p->pid);
	attach_pid(p, PIDTYPE_TGID, p->tgid);
	if (thread_group_leader(p)) {
		attach_pid(p, PIDTYPE_PGID, process_group(p));
		attach_pid(p, PIDTYPE_SID, p->signal->session);
		if (p->pid)
			__get_cpu_var(process_counts)++;
	}

	nr_threads++;
	write_unlock_irq(&tasklist_lock);
	retval = 0;

fork_out:
	if (retval)
		return ERR_PTR(retval);
	return p;

bad_fork_cleanup_namespace:
	exit_namespace(p);
bad_fork_cleanup_keys:
	exit_keys(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	exit_signal(p);
bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_free(p->mempolicy);
#endif
bad_fork_cleanup:
	if (p->binfmt)
		module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
	module_put(p->thread_info->exec_domain->module);
bad_fork_cleanup_count:
	put_group_info(p->group_info);
	atomic_dec(&p->user->processes);
	free_uid(p->user);
bad_fork_free:
	free_task(p);
	goto fork_out;
}

調用dup_task_struct()為新進程創建一個內核棧、thread_info結構和task_struct，這些值與當前進程的值相同。此時，子進程和父進程的描述符是完全相同的。
檢查并確保新創建這個子進程后，當前用戶所擁有的進程數目沒有超過給他分配的資源的限制
現在，子進程著手使自己與父進程區別開來。進程描述符內的許多成員都要被清0或設為初始值。
接下來，子進程的狀態被設置為TASK_UNINTERRUPTIBLE以保證它不會投入運行
copy_process()調用copy_flags()以更新task_struct的flags成員。
調用get_pid()為新進程獲取一個有效的PID
根據傳遞給clone的參數標志，copy_process拷貝或共享打開的文件、文件系統信息等。
讓父進程和子進程平分剩余的時間片
最后，copy_process做掃尾工作并返回一個指向子進程的指針。
再回到do_fork()函數，如果copy_process()成功返回，新創建的子進程被喚醒并讓其投入運行。

vfork()

vfork系統調用和fork()的功能相同，除了不拷貝父進程的頁表項。子進程作為父進程的一個單獨的線程在它的地址空間運行，父進程被阻塞，直到子進程退出或執行exec()。
vfork系統調用的實現是通過向clone系統調用傳遞一個特殊標志進行的。

在調用copy_process()時，task_struct的vfork_done成員被設置為NULL
在執行do_fork()時，如果給定特別標志，則vfork_done會指向一個特殊地址
子進程開始執行后，父進程不是立馬恢復執行，而是一直等待，直到子進程通過vfork_done指針向它發送信號
在調用mm_release()時，該函數用于進程退出內存地址空間，并且檢查vfork_done是否為空，如果不為空，則會向父進程發送信號。
回到do_fork()，父進程醒來并返回。

3 線程在Linux中的實現

Linux實現線程的機制非常獨特。從內核角度來說，它并沒有線程這個概念。Linux把所有的線程都當做進程來實現。內核并沒有準備特別的調度算法或是定義特別的數據結構來表示線程。相反，線程僅僅被視為一個與其他進程共享某些資源的進程。每個線程都擁有唯一隸屬于自己的task_struct，所以在內核中，它看起來就像是一個普通的進程。
線程的創建和普通進程的創建類似，只不過在調用clone的時候需要傳遞一些參數標志來指明需要的共享資源：

clone(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND,0)

上面的代碼產生的結果和調用fork差不多，只是父子進程共享地址空間、文件系統資源、文件描述符和信號處理程序。
一個普通的fork()的實現是：

clone(SIGCHLD,0)

而vfork()的實現是：

clone(CLONE_VFORK | CLONE_VM | SIGCHLD,0)

傳遞給clone的參數標志決定了新創建進程的行為方式和父子進程之間共享的資源種類。這些參數標志定義在include/linux/sched.h文件中。

/*
 * cloning flags:
 */
#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD	0x00010000	/* Same thread group? */
#define CLONE_NEWNS	0x00020000	/* New namespace group? */
#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
#define CLONE_STOPPED		0x02000000	/* Start in stopped state */

內核線程

內核線程是獨立運行在內核空間的標準進程，內核線程和普通的進程間的區別在于內核線程沒有獨立的地址空間（mm指針被設置為NULL）。它們只在內核空間運行，從來不切換到用戶空間去。內核線程和普通進程一樣，可以被調度，也可以被搶占。
內核線程只能由其他內核線程創建，在現有內核線程創建一個新的內核線程的方法如下：

int kernel_thread(int (*fn)(void*),void *arg,unsigned long flags);

新的任務也是通過向普通的clone()系統調用傳遞特定的flags參數而創建的。在上面的函數返回時，父線程退出，并返回一個指向子線程task_struct的指針。

4 進程終結

當一個進程終結時，內核必須釋放它所占有的資源并把這一消息傳給父進程。不論進程是怎樣終結的，該任務大部分都要靠do_exit()來完成，do_exit()定義在kernel/exit.c文件中

fastcall NORET_TYPE void do_exit(long code)
{struct task_struct *tsk = current;int group_dead;profile_task_exit(tsk);if (unlikely(in_interrupt()))panic("Aiee, killing interrupt handler!");if (unlikely(!tsk->pid))panic("Attempted to kill the idle task!");if (unlikely(tsk->pid == 1))panic("Attempted to kill init!");if (tsk->io_context)exit_io_context();tsk->flags |= PF_EXITING;del_timer_sync(&tsk->real_timer);if (unlikely(in_atomic()))printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",current->comm, current->pid,preempt_count());if (unlikely(current->ptrace & PT_TRACE_EXIT)) {current->ptrace_message = code;ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);}group_dead = atomic_dec_and_test(&tsk->signal->live);if (group_dead)acct_process(code);__exit_mm(tsk);exit_sem(tsk);__exit_files(tsk);__exit_fs(tsk);exit_namespace(tsk);exit_thread();exit_keys(tsk);if (group_dead && tsk->signal->leader)disassociate_ctty(1);module_put(tsk->thread_info->exec_domain->module);if (tsk->binfmt)module_put(tsk->binfmt->module);tsk->exit_code = code;exit_notify(tsk);
#ifdef CONFIG_NUMAmpol_free(tsk->mempolicy);tsk->mempolicy = NULL;
#endifBUG_ON(!(current->flags & PF_DEAD));schedule();BUG();/* Avoid "noreturn function does return".  */for (;;) ;
}

首先，將task_struct中的標志成員設置為PF_EXITING
其次，調用del_timer_sync()刪除任一內核定時器
如果BSD的進程記賬功能是開啟的，do_exit()調用acct_process()來輸出記賬信息。
然后調用__exit_mm()函數放棄進程占用的mm_struct，如果沒有別的進程使用它們，就徹底釋放它們
接下來調用exit_sem()函數。如果進程排隊等候IPC信號，則離開隊列
調用__exit_files()、__exit_fs()、exit_namespace()和exit_sighand()，分別遞減文件描述符、文件系統數據、進程名字空間和信號處理函數的引用計數。如果其中某些引用計數的值降為0，那么就代表沒有進程在使用相應的資源，此時可以釋放。
接著把存放在task_struct的exit_code成員中的任務退出碼置為exit()提供的代碼中，或者去完成任何其他由內核機制規定的退出動作
調用exit_notify()向父進程發送信號，將子進程的父進程重新設置為線程組中的其他線程或init進程，并把進程狀態設置為TASK_ZOMBIE
最后，do_exit()調用schedule()切換到其他進程。

至此，與進程相關聯的絕大部分資源都被釋放掉了，它仍然占用的資源只剩下內核棧、thread_info結構和task_struct結構。此時進程存在的唯一目的就是向它的父進程提供信息。父進程檢索到信息后，或者通知內核那是無關的信息后，由進程所持有的剩余內存被釋放，歸還給系統使用。

刪除進程描述符

在調用do_exit()之后，盡管線程已經僵死不能再運行了，但是系統還保留了它的進程描述符。這樣做可以讓系統有辦法在子進程終結后仍能獲得它的信息。
當最終需要釋放進程描述符時，release_task()會被調用，用于完成以下工作：

首先，它調用free_uid()來減少該進程擁有者的進程使用計數。
然后，release_task()調用unhash_process()從pidhash上刪除該進程，同時也要從task_list中刪除該進程
接下來，如果這個進程正在被ptrace跟蹤，release_task()將被跟蹤進程的父進程重設為最初的父進程并將它從ptrace list上刪除。
最后，release_task()調用put_task_struct()釋放該進程內核棧和thread_info結構所占的頁，并釋放task_struct所占的slab高速緩存。

至此，進程描述符和所有進程獨享的資源就全部釋放掉了。

孤兒進程造成的進退維谷

如果父進程在子進程之前退出，那么必須有機制來保證子進程能找到一個新的父親，否則的話這些成為孤兒的進程就會在退出時永遠處于僵死狀態，白白的耗費內存。解決方案：子進程在當前線程組（父進程所在的線程組）內找一個線程作為父親，如果不行，就讓init作為它們的父進程。在do_exit()中會調用exit_notify()，該函數會通過forget_original_parent()來執行尋父過程。

static inline void forget_original_parent(struct task_struct * father,struct list_head *to_release)
{struct task_struct *p, *reaper = father;struct list_head *_p, *_n;do {reaper = next_thread(reaper);if (reaper == father) {reaper = child_reaper;break;}} while (reaper->exit_state >= EXIT_ZOMBIE);/** There are only two places where our children can be:** - in our child list* - in our ptraced child list** Search them and reparent children.*/list_for_each_safe(_p, _n, &father->children) {int ptrace;p = list_entry(_p,struct task_struct,sibling);ptrace = p->ptrace;/* if father isn't the real parent, then ptrace must be enabled */BUG_ON(father != p->real_parent && !ptrace);if (father == p->real_parent) {/* reparent with a reaper, real father it's us */choose_new_parent(p, reaper, child_reaper);reparent_thread(p, father, 0);} else {/* reparent ptraced task to its real parent */__ptrace_unlink (p);if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&thread_group_empty(p))do_notify_parent(p, p->exit_signal);}/** if the ptraced child is a zombie with exit_signal == -1* we must collect it before we exit, or it will remain* zombie forever since we prevented it from self-reap itself* while it was being traced by us, to be able to see it in wait4.*/if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))list_add(&p->ptrace_list, to_release);}list_for_each_safe(_p, _n, &father->ptrace_children) {p = list_entry(_p,struct task_struct,ptrace_list);choose_new_parent(p, reaper, child_reaper);reparent_thread(p, father, 1);}
}

先在線程組里找一個線程作為父進程，如果線程組內沒有其他的進程，就將init設為父進程。當合適的父進程找到后，只需要遍歷所有子進程并為它們設置新的父進程。

后面遍歷了兩個鏈表：子進程鏈表和ptrace子進程鏈表。給每個子進程設置新的父進程。當一個進程被跟蹤時，它被暫時設定為調試進程的子進程。此時如果它的父進程退出了，系統會為它和它所有的兄弟重新找一個父進程。在以前的內核中，這就需要遍歷系統所有的進程來找這些子進程，現在的解決辦法是在一個單獨的ptrace跟蹤的子進程鏈表中搜索相關的兄弟進程，用兩個相關鏈表減輕了遍歷帶來的消耗。

5 小結

在本文中，我們討論了進程的一般特性，它為何如此重要，以及進程與線程之間的關系。然后，討論了Linux如何存放和表示進程（用task_struct和thread_info），如何創建進程（通過clone()和fork()），如何把新的執行映像裝入到地址空間（通過exec()調用族），如何表示進程的層次關系，父進程又是如何收集后代的信息（通過wait()系統調用族），以及進程最終如何死亡（強制或自愿調用exit()）。