好久不更新,上個硬貨。使用本腳本前需提前準備:宿主和rRNA(SSU、LSU)的bowtie2索引文件;按{sample}_raw_1.fq.gz和{sample}_raw_2.fq.gz命名的原始數據;一個保存有原始數據路徑的文件;一個保存樣品列表的文件。
最後打個廣告,歡迎畜牧學方向的研究生報考蘭州大學。
#!/bin/bash
# 腳本功能:二代測序數據質控、去rRNA與去宿主序列處理
#版本1.2,新增了--rrna功能
# Print the command-line synopsis to stdout and terminate the script.
# Always exits with status 1, including when reached through --help.
usage() {
  echo "用法: $0 [--host <宿主類型>] [--rrna] <路徑文件> <樣品名列表文件>"
  exit 1
}

# ---------- Path configuration ----------
# Bowtie2 index prefix for each supported host genome, keyed by the value
# accepted by --host. Each element must be its own word: without whitespace
# between them bash folds the later "[key]=value" text into the first value.
declare -A HOST_INDEX=(
  [sheep]="/mnt/data/database/bowtie/sheep/sheepgenome"
  [goat]="/mnt/data/database/bowtie/goat/goatgenome"
  [cattle]="/mnt/data/database/bowtie/cattle/cattlegenome"
)
# Bowtie2 index prefixes of the SSU and LSU rRNA databases used by --rrna.
SSU_INDEX="/mnt/data/database/bowtie/SSU/SSU"
LSU_INDEX="/mnt/data/database/bowtie/LSU/LSU"

# Initialize variables
# Defaults: no positional arguments seen yet, no host filtering and no rRNA
# filtering. REMOVE_RRNA holds the literal word true or false.
INPUT_FILE=
SAMPLE_LIST=
HOST=
REMOVE_RRNA=false
# Enhanced argument parsing.
#
# parse_args ARG...
#   Globals written: HOST, REMOVE_RRNA, INPUT_FILE, SAMPLE_LIST
#   --host TYPE   host genome to filter against; TYPE must be exactly one of
#                 sheep, goat or cattle, otherwise the script exits 1
#   --rrna        enable rRNA removal
#   --help        print the synopsis via usage (which exits)
#   The first two non-option arguments become INPUT_FILE and SAMPLE_LIST;
#   a third one, or an unknown option, is reported and handed to usage.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --host)
        # Exact match only: a substring test would accept "sheep goat".
        case "${2-}" in
          sheep | goat | cattle) ;;
          *)
            echo "錯誤:無效宿主類型 '$2',可選: sheep/goat/cattle"
            exit 1
            ;;
        esac
        HOST="$2"
        shift 2
        ;;
      --rrna)
        REMOVE_RRNA=true
        shift
        ;;
      --help)
        usage
        ;;
      -*)
        echo "錯誤:未知選項 $1"
        usage
        ;;
      *)
        # Positional arguments are filled in order of appearance.
        if [[ -z $INPUT_FILE ]]; then
          INPUT_FILE="$1"
        elif [[ -z $SAMPLE_LIST ]]; then
          SAMPLE_LIST="$1"
        else
          echo "錯誤:多余參數 $1"
          usage
        fi
        shift
        ;;
    esac
  done
}
parse_args "$@"

# Validate required arguments
# Both positional arguments are mandatory; report and show the synopsis
# (usage exits) when either one is missing.
if [[ -z $INPUT_FILE || -z $SAMPLE_LIST ]]; then
  echo "錯誤:缺失必要參數!"
  usage
fi

# Read the raw-data path and validate it
# INPUT_FILE is a text file whose content is the raw-data directory path.
if [[ ! -f $INPUT_FILE || ! -r $INPUT_FILE ]]; then
  echo "錯誤:路徑文件 $INPUT_FILE 不存在或不可讀"
  exit 1
fi
INPUT_DIR=$(<"$INPUT_FILE")
# Drop a trailing carriage return so a file saved with CRLF line endings
# still names an existing directory.
INPUT_DIR=${INPUT_DIR%$'\r'}
if [[ ! -d $INPUT_DIR ]]; then
  echo "錯誤:路徑文件中的目錄 $INPUT_DIR 不存在"
  exit 1
fi

# Host index validation
# When a host was requested, resolve its bowtie2 index prefix into INDEX and
# make sure the index has been built. bowtie2-build names the first index
# file <prefix>.1.bt2, or <prefix>.1.bt2l for a large index; either is fine.
if [[ -n $HOST ]]; then
  INDEX="${HOST_INDEX[$HOST]}"
  if [[ ! -f "${INDEX}.1.bt2" && ! -f "${INDEX}.1.bt2l" ]]; then
    echo "錯誤:宿主索引文件 ${INDEX}.1.bt2 不存在"
    exit 1
  fi
fi

# ---------- Directory creation ----------
# Results go to a "cleandata" directory that is a sibling of the raw-data
# directory (same parent as INPUT_DIR).
CLEAN_DIR=$(dirname -- "$INPUT_DIR")/cleandata
mkdir -p -- "$CLEAN_DIR" || {
  echo "錯誤:無法創建目錄 $CLEAN_DIR"
  exit 1
}

# ---------- Processing pipeline ----------
# Align one sample's current clip reads against a bowtie2 index and keep only
# the pairs that did NOT align concordantly.
# Globals:   CLEAN_DIR (read), BOWTIE2_THREADS (read, optional, default 192)
# Arguments: $1 - sample name
#            $2 - bowtie2 index prefix
#            $3 - label used in the name of the aligned-pairs output files
#            $4 - tag used in the name of the log file
# Outputs:   <sample>.<label>.fq.{1,2}.gz (aligned pairs) and
#            <sample>.<tag>.log in CLEAN_DIR; the clip files are replaced by
#            the unaligned pairs.
# Exits the script with status 1 if bowtie2 or a rename fails.
_remove_aligned_pairs() {
  local sample=$1 index=$2 label=$3 log_tag=$4
  local clip_1="${CLEAN_DIR}/${sample}.clip.1.fq.gz"
  local clip_2="${CLEAN_DIR}/${sample}.clip.2.fq.gz"
  local unaligned="${CLEAN_DIR}/${sample}.clean.fq"
  local log="${CLEAN_DIR}/${sample}.${log_tag}.log"

  # bowtie2 reports its alignment summary and errors on stderr, so stderr is
  # what has to be captured; --quiet is not passed because it would suppress
  # the summary and leave the log empty. The SAM output itself is discarded.
  bowtie2 -p "${BOWTIE2_THREADS:-192}" -x "$index" \
    -1 "$clip_1" -2 "$clip_2" \
    --un-conc-gz "${unaligned}.gz" \
    --al-conc-gz "${CLEAN_DIR}/${sample}.${label}.fq.gz" \
    -S /dev/null > "$log" 2>&1 || {
    echo "錯誤:bowtie2處理失敗(${label}),詳見 $log"
    exit 1
  }

  # bowtie2 inserts .1/.2 before the final extension of the --un-conc-gz
  # path; move those files over the clip files so the next step reads them.
  mv -- "${unaligned}.1.gz" "$clip_1" || exit 1
  mv -- "${unaligned}.2.gz" "$clip_2" || exit 1
}

# Run the full clean-up pipeline for one sample: fastp quality control, then
# optional host removal, then optional rRNA (SSU followed by LSU) removal.
# Globals:   INPUT_DIR, CLEAN_DIR, HOST, INDEX, REMOVE_RRNA, SSU_INDEX,
#            LSU_INDEX (all read)
# Arguments: $1 - sample name; reads are expected at
#            INPUT_DIR/<sample>_raw_1.fq.gz and INPUT_DIR/<sample>_raw_2.fq.gz
# Outputs:   progress messages on stdout; final reads in
#            CLEAN_DIR/<sample>.clip.1.fq.gz and CLEAN_DIR/<sample>.clip.2.fq.gz
# Exits the script with status 1 as soon as any step fails.
process_sample() {
  local sample=$1
  local raw_1="${INPUT_DIR}/${sample}_raw_1.fq.gz"
  local raw_2="${INPUT_DIR}/${sample}_raw_2.fq.gz"
  local clip_1="${CLEAN_DIR}/${sample}.clip.1.fq.gz"
  local clip_2="${CLEAN_DIR}/${sample}.clip.2.fq.gz"

  # Step 1: quality control (fastp with default parameters)
  echo "[$(date)] 處理樣品 $sample:質控中..."
  fastp -i "$raw_1" -I "$raw_2" -o "$clip_1" -O "$clip_2" || {
    echo "錯誤:fastp處理失敗!"
    exit 1
  }

  # Step 2: host removal
  if [[ -n $HOST ]]; then
    echo "[$(date)] 去除宿主 $HOST 序列..."
    _remove_aligned_pairs "$sample" "$INDEX" host bowtie2
  fi

  # Step 3: rRNA removal. Compared as a string so that an unset or empty
  # REMOVE_RRNA counts as "off" rather than being run as an empty command.
  if [[ $REMOVE_RRNA == true ]]; then
    echo "[$(date)] 去除rRNA序列(SSU+LSU)..."
    _remove_aligned_pairs "$sample" "$SSU_INDEX" SSU SSU
    _remove_aligned_pairs "$sample" "$LSU_INDEX" LSU LSU
  fi
}

# Iterate over the sample list
# Process every sample named in SAMPLE_LIST (one name per line), then sort
# the by-products into host/, rRNA/ and log/ sub-directories of CLEAN_DIR.
if [[ ! -f $SAMPLE_LIST || ! -r $SAMPLE_LIST ]]; then
  echo "錯誤:樣品列表文件 $SAMPLE_LIST 不存在或不可讀"
  exit 1
fi

# The list is read on descriptor 3 so that nothing run inside the loop can
# consume it through stdin. The "|| [[ -n ... ]]" keeps a final line that has
# no trailing newline.
while IFS= read -r sample <&3 || [[ -n $sample ]]; do
  sample=${sample%$'\r'}          # tolerate CRLF line endings
  [[ -z $sample ]] && continue    # skip blank lines
  process_sample "$sample"
done 3< "$SAMPLE_LIST"

# -p keeps a re-run from failing on directories that already exist; :? aborts
# instead of creating /host, /rRNA and /log if CLEAN_DIR is somehow empty.
mkdir -p -- "${CLEAN_DIR:?}/host" "${CLEAN_DIR}/rRNA" "${CLEAN_DIR}/log" || exit 1

# Move only regular files that actually exist: a pattern with no match stays
# literal, and the host/rRNA files are absent when the matching option was
# not given.
for f in "${CLEAN_DIR}"/*host.fq*; do
  if [[ -f $f ]]; then mv -- "$f" "${CLEAN_DIR}/host/"; fi
done
for f in "${CLEAN_DIR}"/*SU.fq*; do
  if [[ -f $f ]]; then mv -- "$f" "${CLEAN_DIR}/rRNA/"; fi
done
# "*.log" rather than "*log", which would also match the log directory itself.
for f in "${CLEAN_DIR}"/*.log; do
  if [[ -f $f ]]; then mv -- "$f" "${CLEAN_DIR}/log/"; fi
done

echo "[$(date)] 處理完成!結果保存至:$CLEAN_DIR"