2.3 概率抽樣
一、簡單隨機抽樣
# 數據預處理
LoanStats3c = read.csv("D:/OneDrive - stu.fynu.edu.cn/大四上學期/ysq-大數據探索性分析/data/2數據集二:Loan Data--Lending Club/LoanStats3c/LoanStats3c.csv", header = TRUE, fill = TRUE, comment.char = "")
# str(LoanStats3c)
# 將有缺失值的列進行中位數填補(針對數值列)# 檢測列類型為數值的列,輸出列序號為列表
num_cols <- sapply(LoanStats3c, is.numeric)
num_col_indices <- which(num_cols)# 對每個數值列,如果存在缺失值則用中位數填補
for(i in num_col_indices) {col_name <- names(LoanStats3c)[i]na_count <- sum(is.na(LoanStats3c[[i]]))if(na_count > 0) {med <- median(LoanStats3c[[i]], na.rm = TRUE)if(is.na(med)) {# 全為 NA 的列,輸出提示并跳過message(sprintf("跳過列 %s (索引 %d):全為 NA,無法用中位數填補", col_name, i))} else {LoanStats3c[[i]][is.na(LoanStats3c[[i]])] <- medmessage(sprintf("列 %s (索引 %d):用中位數 %s 填補 %d 個缺失值", col_name, i, format(med), na_count))}}
}# 再次統計數值列的缺失值數(用于檢查)
na_summary <- sapply(LoanStats3c[num_col_indices], function(x) sum(is.na(x)))
print(na_summary)# 寫入數值列索引到csv文件,供后續分析使用
write.csv(num_col_indices, file = "D:/OneDrive - stu.fynu.edu.cn/大四上學期/ysq-大數據探索性分析/demo2/num_col_indices.csv", row.names = FALSE)
# 保存清洗后的數據集
write.csv(LoanStats3c, file = "D:/OneDrive - stu.fynu.edu.cn/大四上學期/ysq-大數據探索性分析/demo2/LoanStats3c_imputed.csv", row.names = FALSE)# str(LoanStats3c) // 可取消注釋查看數據結構
列 member_id (索引 2):用中位數 22953173 填補 4 個缺失值列 loan_amnt (索引 3):用中位數 13000 填補 4 個缺失值列 funded_amnt (索引 4):用中位數 13000 填補 4 個缺失值列 loan_amnt (索引 3):用中位數 13000 填補 4 個缺失值列 funded_amnt (索引 4):用中位數 13000 填補 4 個缺失值列 funded_amnt_inv (索引 5):用中位數 13000 填補 4 個缺失值列 installment (索引 8):用中位數 384.14 填補 4 個缺失值列 funded_amnt_inv (索引 5):用中位數 13000 填補 4 個缺失值列 installment (索引 8):用中位數 384.14 填補 4 個缺失值列 annual_inc (索引 14):用中位數 65000 填補 4 個缺失值列 dti (索引 25):用中位數 17.63 填補 4 個缺失值列 delinq_2yrs (索引 26):用中位數 0 填補 4 個缺失值列 annual_inc (索引 14):用中位數 65000 填補 4 個缺失值列 dti (索引 25):用中位數 17.63 填補 4 個缺失值列 delinq_2yrs (索引 26):用中位數 0 填補 4 個缺失值列 inq_last_6mths (索引 28):用中位數 0 填補 4 個缺失值列 mths_since_last_delinq (索引 29):用中位數 30 填補 115885 個缺失值列 mths_since_last_record (索引 30):用中位數 69 填補 194109 個缺失值列 inq_last_6mths (索引 28):用中位數 0 填補 4 個缺失值列 mths_since_last_delinq (索引 29):用中位數 30 填補 115885 個缺失值列 mths_since_last_record (索引 30):用中位數 69 填補 194109 個缺失值列 open_acc (索引 31):用中位數 11 填補 4 個缺失值列 open_acc (索引 31):用中位數 11 填補 4 個缺失值列 pub_rec (索引 32):用中位數 0 填補 4 個缺失值列 revol_bal (索引 33):用中位數 11686 填補 4 個缺失值列 pub_rec (索引 32):用中位數 0 填補 4 個缺失值列 revol_bal (索引 33):用中位數 11686 填補 4 個缺失值列 total_acc (索引 35):用中位數 24 填補 4 個缺失值列 out_prncp (索引 37):用中位數 9823.83 填補 4 個缺失值列 total_acc (索引 35):用中位數 24 填補 4 個缺失值列 out_prncp (索引 37):用中位數 9823.83 填補 4 個缺失值列 out_prncp_inv (索引 38):用中位數 9817.7 填補 4 個缺失值列 total_pymnt (索引 39):用中位數 3478.72 填補 4 個缺失值列 out_prncp_inv (索引 38):用中位數 9817.7 填補 4 個缺失值列 total_pymnt (索引 39):用中位數 3478.72 填補 4 個缺失值列 total_pymnt_inv (索引 40):用中位數 3478.08 填補 4 個缺失值列 total_rec_prncp (索引 41):用中位數 2152.3 填補 4 個缺失值列 total_rec_int (索引 42):用中位數 995.42 填補 4 個缺失值列 total_pymnt_inv (索引 40):用中位數 3478.08 填補 4 個缺失值列 total_rec_prncp (索引 41):用中位數 2152.3 填補 4 個缺失值列 total_rec_int (索引 42):用中位數 995.42 填補 4 個缺失值列 total_rec_late_fee (索引 43):用中位數 0 填補 4 個缺失值列 total_rec_late_fee (索引 43):用中位數 0 填補 4 個缺失值列 recoveries (索引 44):用中位數 0 填補 4 個缺失值列 collection_recovery_fee (索引 45):用中位數 0 填補 4 個缺失值列 last_pymnt_amnt (索引 47):用中位數 420.64 填補 4 個缺失值列 recoveries (索引 44):用中位數 0 填補 4 個缺失值列 collection_recovery_fee (索引 45):用中位數 0 填補 4 個缺失值列 last_pymnt_amnt (索引 47):用中位數 420.64 填補 4 個缺失值列 collections_12_mths_ex_med (索引 50):用中位數 0 填補 4 個缺失值列 mths_since_last_major_derog (索引 51):用中位數 43 填補 169155 個缺失值列 policy_code (索引 52):用中位數 1 填補 4 個缺失值列 collections_12_mths_ex_med (索引 50):用中位數 0 填補 4 個缺失值列 mths_since_last_major_derog (索引 51):用中位數 43 填補 169155 個缺失值列 policy_code (索引 52):用中位數 1 填補 4 個缺失值member_id loan_amnt 0 0 funded_amnt funded_amnt_inv 0 0 installment annual_inc 0 0 dti delinq_2yrs 0 0 inq_last_6mths mths_since_last_delinq 0 0 mths_since_last_record open_acc 0 0 pub_rec revol_bal 0 0 total_acc out_prncp 0 0 out_prncp_inv total_pymnt 0 0 total_pymnt_inv total_rec_prncp 0 0 total_rec_int total_rec_late_fee 0 0 recoveries collection_recovery_fee 0 0 last_pymnt_amnt collections_12_mths_ex_med 0 0
mths_since_last_major_derog policy_code 0 0
# 推薦使用read.csv,并加上fill=TRUE和comment.char=""參數,避免列數不一致報錯
LoanStats3c = read.csv("D:/OneDrive - stu.fynu.edu.cn/大四上學期/ysq-大數據探索性分析/demo2/LoanStats3c_imputed.csv", header = TRUE, fill = TRUE, comment.char = "")
names(LoanStats3c) #數據中所有變量名
library(sampling) #抽樣函數所在的包
N=dim(LoanStats3c)[1] #總體數量
n=500 #樣本數量
srsp=srswor(n,N) #簡單隨機抽樣
srs=getdata(LoanStats3c,srsp) #得到樣本# str(srs)
# 檢測列類型為數值的列,輸出列序號為列表
new_num_cols <- sapply(srs, is.numeric)
new_num_col_indices <- which(new_num_cols)
# 輸出數值列的序號
# print(new_num_col_indices)
'id' 'member_id' 'loan_amnt' 'funded_amnt' 'funded_amnt_inv' 'term' 'int_rate' 'installment' 'grade' 'sub_grade' 'emp_title' 'emp_length' 'home_ownership' 'annual_inc' 'verification_status' 'issue_d' 'loan_status' 'pymnt_plan' 'url' 'desc' 'purpose' 'title' 'zip_code' 'addr_state' 'dti' 'delinq_2yrs' 'earliest_cr_line' 'inq_last_6mths' 'mths_since_last_delinq' 'mths_since_last_record' 'open_acc' 'pub_rec' 'revol_bal' 'revol_util' 'total_acc' 'initial_list_status' 'out_prncp' 'out_prncp_inv' 'total_pymnt' 'total_pymnt_inv' 'total_rec_prncp' 'total_rec_int' 'total_rec_late_fee' 'recoveries' 'collection_recovery_fee' 'last_pymnt_d' 'last_pymnt_amnt' 'next_pymnt_d' 'last_credit_pull_d' 'collections_12_mths_ex_med' 'mths_since_last_major_derog' 'policy_code'
# 簡單估計比較樣本均值和總體均值
meanY = colMeans(LoanStats3c[, num_col_indices], na.rm = TRUE) # 總體均值
meany = colMeans(srs[, new_num_col_indices], na.rm = TRUE) # 樣本均值
# 輸出對象長度并檢查匹配
cat('length(meanY) =', length(meanY), '\n')
cat('length(meany) =', length(meany), '\n')
if(length(meanY) != length(meany)) {warning('總體均值和樣本均值長度不一致:嘗試按共有變量對齊')common_names <- intersect(names(meanY), names(meany))meanY <- meanY[common_names]meany <- meany[common_names]cat('對齊后長度 =', length(meanY), '\n')
}
# 計算差值并顯示前幾項
md = meanY - meany
print(head(md))
length(meanY) = 28
length(meany) = 29
length(meany) = 29 Warning message:
"總體均值和樣本均值長度不一致:嘗試按共有變量對齊"對齊后長度 = 28 member_id loan_amnt funded_amnt funded_amnt_inv installment -411336.24337 -518.87495 -518.87495 -519.04750 -10.08388 annual_inc -1301.80738 member_id loan_amnt funded_amnt funded_amnt_inv installment -411336.24337 -518.87495 -518.87495 -519.04750 -10.08388 annual_inc -1301.80738
二、分層隨機抽樣
# 確認存在 grade 列
if(!"grade" %in% names(LoanStats3c)) {stop("數據集中不存在名為 'grade' 的列,請檢查變量名(區分大小寫)")
}
# 清洗 grade 字段:去除首尾空白并統一為大寫,避免 ' A' 或 'a' 等問題
LoanStats3c$grade <- trimws(as.character(LoanStats3c$grade))
LoanStats3c$grade <- toupper(LoanStats3c$grade)
# 將 grade 轉為因子并指定 A-F 順序(如只存在部分等級,factor 會自動處理)
LoanStats3c$grade <- factor(LoanStats3c$grade, levels = c("A","B","C","D","E","F"))
# 去掉 grade 為 NA 的行(否則會導致 strata 的 size 對應出現 NA)
df_nomiss_grade <- LoanStats3c[!is.na(LoanStats3c$grade), ]
if(nrow(df_nomiss_grade) == 0) stop('去除 NA 后沒有可用于分層的數據')
# 計算各等級總量(基于去除 NA 的數據)
grade_counts <- table(df_nomiss_grade$grade)
# 僅保留數據中實際存在的等級(count > 0)
present_levels <- names(grade_counts[grade_counts > 0])
if(length(present_levels) == 0) stop('數據中沒有可用的 grade 等級')
counts_present <- as.integer(grade_counts[present_levels])
names(counts_present) <- present_levels
# 若請求樣本量 n 大于總體容量,調整 n
total_capacity <- sum(counts_present)
if(n > total_capacity) {warning(sprintf('請求樣本量 n=%d 大于總體容量 %d,已將 n 調整為 %d', n, total_capacity, total_capacity))n <- total_capacity
}
# 按比例初始分配(僅對存在的等級)
prop_present <- counts_present / sum(counts_present)
wh_present <- as.integer(round(n * prop_present))
# 確保每層不超過該層容量,若超過則先截斷
over_idx <- which(wh_present > counts_present)
if(length(over_idx) > 0) {wh_present[over_idx] <- counts_present[over_idx]
}
# 調整使總和等于 n:若不足則在有剩余容量的層中循環分配
alloc_sum <- sum(wh_present)
diff <- n - alloc_sum
if(diff > 0) {# 逐步分配剩余名額到還有容量的層,按容量從大到小循環分配avail <- counts_present - wh_presentorder_idx <- order(avail, decreasing = TRUE)i <- 1while(diff > 0 && sum(avail) > 0) {idx <- order_idx[((i - 1) %% length(order_idx)) + 1]if(avail[idx] > 0) {wh_present[idx] <- wh_present[idx] + 1avail[idx] <- avail[idx] - 1diff <- diff - 1}i <- i + 1}
}
# 若分配過多(diff < 0),從有多余的層中減少
if(diff < 0) {# 從當前分配量最多的層開始減少,直到匹配order_idx <- order(wh_present, decreasing = TRUE)i <- 1while(diff < 0) {idx <- order_idx[((i - 1) %% length(order_idx)) + 1]if(wh_present[idx] > 0) {wh_present[idx] <- wh_present[idx] - 1diff <- diff + 1}i <- i + 1}
}
names(wh_present) <- present_levels
message('各層目標樣本量(僅列出存在的等級):')
print(wh_present)
# 為 strata 準備數據:按 grade 排序(strata 內部以出現順序匹配 size)
o <- order(df_nomiss_grade$grade)
data_o <- df_nomiss_grade[o, ]
# 確保 size 的順序與 data 中實際出現的等級順序一致
data_levels_in_order <- unique(as.character(data_o$grade))
message('data 中實際出現的等級順序:')
print(data_levels_in_order)
message('wh_present 的名字:')
print(names(wh_present))
# 從 wh_present 中按 data_levels_in_order 提取樣本量(使用安全映射),并確保沒有 NA,轉換為整數
size_for_strata <- sapply(data_levels_in_order, function(l) {if(!is.na(l) && l %in% names(wh_present)) as.integer(wh_present[[l]]) else 0
})
names(size_for_strata) <- data_levels_in_order
# 計算每層實際容量(按 data_o 中的計數),使用 sapply 避免 NA
counts_in_order <- sapply(data_levels_in_order, function(l) sum(data_o$grade == l, na.rm = TRUE))
# 再次確保 size_for_strata 不超過每層實際容量
size_for_strata <- pmin(as.integer(size_for_strata), as.integer(counts_in_order))
message('傳遞給 strata 的 size 向量(按 data 中等級順序):')
print(size_for_strata)
message('對應每層的容量(counts_in_order):')
print(counts_in_order)
# 若全部為 0,則停止
if(all(size_for_strata == 0)) stop('分配到各層的樣本數均為 0,無法抽樣,請檢查 n 的值或 grade 分布')
# 調用 strata 進行分層無放回抽樣
srp <- strata(data = data_o, stratanames = "grade", size = size_for_strata, method = "srswor")
# 提取樣本并恢復原始順序(可選)
# sr 使用原始數據框和 srp 索引來獲取對應行
sr <- getdata(LoanStats3c, srp)
srs <- sr
# 更新數值列索引(供后續使用)
new_num_cols <- sapply(srs, is.numeric)
new_num_col_indices <- which(new_num_cols)
message(sprintf("分層抽樣完成,樣本行數 = %d", nrow(srs)))
# 查看各層實際入選數量
print(table(srs$grade))
# 輸出入選樣本前幾行
print(head(srs))
# 可選:保存分層樣本
write.csv(srs, file = "D:/OneDrive - stu.fynu.edu.cn/大四上學期/ysq-大數據探索性分析/demo2/srs_by_grade.csv", row.names = FALSE)
各層目標樣本量(僅列出存在的等級):A B C D E F 77 132 143 92 43 13 data 中實際出現的等級順序:[1] "A" "B" "C" "D" "E" "F"wh_present 的名字:[1] "A" "B" "C" "D" "E" "F"傳遞給 strata 的 size 向量(按 data 中等級順序):[1] 77 132 143 92 43 13對應每層的容量(counts_in_order):A B C D E F
36108 61935 66565 42992 20121 6223 分層抽樣完成,樣本行數 = 500A B C D E F 77 132 143 92 43 13 id member_id loan_amnt funded_amnt funded_amnt_inv term
544 36019516 38721136 10000 10000 10000 36 months
560 37690957 40463819 32000 32000 32000 36 months
676 37791309 40554270 8400 8400 8400 36 months
747 37840891 40603766 28000 28000 28000 60 months
965 36733440 39476198 8000 8000 8000 36 months
1514 37840801 40603650 4500 4500 4500 36 monthsint_rate installment sub_grade emp_title emp_length
544 7.49% 311.02 A4 owner 8 years
560 12.39% 1068.83 C1 RN CASE MANAGER 3 years
676 14.99% 291.15 C5 Merchandising Manager 8 years
747 9.49% 587.92 B2 System Administrator 10+ years
965 7.49% 248.82 A4 Controller 3 years
1514 12.39% 150.31 C1 OPERATIONS MANAGER 8 yearshome_ownership annual_inc verification_status issue_d loan_status
544 MORTGAGE 225000 VERIFIED - income source Dec-14 Current
560 OWN 70000 VERIFIED - income Dec-14 Current
676 OWN 34750 VERIFIED - income source Dec-14 Current
747 MORTGAGE 125000 VERIFIED - income source Dec-14 Current
965 RENT 140000 VERIFIED - income source Dec-14 Current
1514 RENT 40000 not verified Dec-14 Currentpymnt_plan
544 n
560 n
676 n
747 n
965 n
1514 nurl desc
544 https://www.lendingclub.com/browse/loanDetail.action?loan_id=36019516
560 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37690957
676 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37791309
747 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37840891
965 https://www.lendingclub.com/browse/loanDetail.action?loan_id=36733440
1514 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37840801 purpose title zip_code addr_state dti
544 home_improvement Home improvement 370xx TN 7.79
560 debt_consolidation Debt consolidation 769xx TX 17.75
676 debt_consolidation Debt consolidation 840xx UT 25.15
747 credit_card Credit card refinancing 103xx NY 21.00
965 debt_consolidation Debt consolidation 864xx AZ 4.78
1514 credit_card Credit card refinancing 072xx NJ 18.67delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
544 0 Sep-95 0 76
560 0 Dec-99 0 30
676 0 Jan-90 1 30
747 0 Dec-85 0 30
965 0 Sep-99 1 26
1514 0 Mar-05 0 30mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
544 75 11 1 25176 26.20% 23
560 69 10 0 25295 45.70% 20
676 69 9 0 11666 81.60% 10
747 69 23 0 93879 56.40% 44
965 69 8 0 13018 64.80% 28
1514 69 7 0 5733 56.80% 11initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
544 f 8996.24 8996.24 1237.84 1237.84
560 f 29000.22 29000.22 4242.28 4242.28
676 f 7834.33 7834.33 862.96 862.96
747 f 26516.57 26516.57 2329.54 2329.54
965 w 7196.97 7196.97 991.95 991.95
1514 f 4078.13 4078.13 598.14 598.14total_rec_prncp total_rec_int total_rec_late_fee recoveries
544 1003.76 234.08 0 0
560 2999.78 1242.50 0 0
676 565.67 297.29 0 0
747 1483.43 846.11 0 0
965 803.03 188.92 0 0
1514 421.87 176.27 0 0collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
544 0 Apr-15 311.02 May-15
560 0 Apr-15 1068.83 May-15
676 0 Apr-15 291.15 May-15
747 0 Apr-15 587.92 May-15
965 0 Apr-15 248.82 May-15
1514 0 Apr-15 150.31 May-15last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
544 Apr-15 0 76
560 Apr-15 0 43
676 Apr-15 0 43
747 Apr-15 0 43
965 Apr-15 0 28
1514 Apr-15 0 43policy_code grade ID_unit Prob Stratum
544 1 A 544 0.002132491 1
560 1 A 560 0.002132491 1
676 1 A 676 0.002132491 1
747 1 A 747 0.002132491 1
965 1 A 965 0.002132491 1
1514 1 A 1514 0.002132491 1
三、整群抽樣
#—————— 整群抽樣(穩健實現)——————#
cluster_col <- "home_ownership"
if(!cluster_col %in% names(LoanStats3c)) {stop(sprintf("找不到列 %s,請檢查變量名", cluster_col))
}
# 計算可用簇(去除 NA)
clusters <- unique(na.omit(LoanStats3c[[cluster_col]]))
n_clusters <- length(clusters)
requested_clusters <- 10 # 希望抽取的簇數,可修改
if(n_clusters == 0) stop(sprintf("列 %s 沒有可用的簇(全部為 NA)", cluster_col))
# 將請求的簇數限制為實際可用簇數
size_clusters <- min(requested_clusters, n_clusters)
message(sprintf("簇總數 = %d, 請求 = %d, 實際將抽取 = %d", n_clusters, requested_clusters, size_clusters))
# 如果 size_clusters 為 0 則無法抽樣
if(size_clusters <= 0) stop("沒有可用的簇可抽取")
# 調用 cluster 函數進行抽樣(無放回)
scp <- cluster(data = LoanStats3c, clustername = cluster_col, size = size_clusters, method = "srswor", description = FALSE)
sc <- getdata(LoanStats3c, scp)
# 查看抽取到的簇和樣本數量
message(sprintf("抽到的簇數量 = %d, 抽樣得到的行數 = %d", length(unique(scp$ID_1)), nrow(sc)))
print(head(sc))
簇總數 = 5, 請求 = 10, 實際將抽取 = 5抽到的簇數量 = 0, 抽樣得到的行數 = 235633抽到的簇數量 = 0, 抽樣得到的行數 = 235633id member_id loan_amnt
235630 22953173 13000
235631 22953173 13000
235632 Total amount funded in policy code 1: 3503840175 22953173 13000
235633 Total amount funded in policy code 2: 873663239 22953173 13000
80686 26170263 28642950 5000
16527 35226318 37907692 19200funded_amnt funded_amnt_inv term int_rate installment grade
235630 13000 13000 384.14 <NA>
235631 13000 13000 384.14 <NA>
235632 13000 13000 384.14 <NA>
235633 13000 13000 384.14 <NA>
80686 5000 5000 36 months 11.67% 165.29 B
16527 19200 19200 36 months 9.49% 614.95 Bsub_grade emp_title emp_length annual_inc
235630 65000
235631 65000
235632 65000
235633 65000
80686 B4 Office Administrative Assistant 5 years 35680
16527 B2 Controller 10+ years 38400verification_status issue_d loan_status pymnt_plan
235630
235631
235632
235633
80686 VERIFIED - income source Sep-14 Fully Paid n
16527 not verified Nov-14 Current nurl
235630
235631
235632
235633
80686 https://www.lendingclub.com/browse/loanDetail.action?loan_id=26170263
16527 https://www.lendingclub.com/browse/loanDetail.action?loan_id=35226318desc purpose title zip_code addr_state
235630
235631
235632
235633
80686 debt_consolidation Debt consolidation 757xx TX
16527 credit_card Credit card refinancing 476xx INdti delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
235630 17.63 0 0 30
235631 17.63 0 0 30
235632 17.63 0 0 30
235633 17.63 0 0 30
80686 28.12 0 Dec-03 0 55
16527 28.94 0 Oct-96 0 27mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
235630 69 11 0 11686 24
235631 69 11 0 11686 24
235632 69 11 0 11686 24
235633 69 11 0 11686 24
80686 69 7 0 3319 43.10% 12
16527 69 9 0 19353 64.50% 30initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
235630 9823.83 9817.70 3478.720 3478.08
235631 9823.83 9817.70 3478.720 3478.08
235632 9823.83 9817.70 3478.720 3478.08
235633 9823.83 9817.70 3478.720 3478.08
80686 f 0.00 0.00 5048.625 5048.62
16527 w 16847.53 16847.53 3064.630 3064.63total_rec_prncp total_rec_int total_rec_late_fee recoveries
235630 2152.30 995.42 0 0
235631 2152.30 995.42 0 0
235632 2152.30 995.42 0 0
235633 2152.30 995.42 0 0
80686 5000.00 48.62 0 0
16527 2352.47 712.16 0 0collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
235630 0 420.64
235631 0 420.64
235632 0 420.64
235633 0 420.64
80686 0 Oct-14 5048.63
16527 0 Apr-15 614.95 May-15last_credit_pull_d collections_12_mths_ex_med
235630 0
235631 0
235632 0
235633 0
80686 Apr-15 0
16527 Apr-15 0mths_since_last_major_derog policy_code home_ownership ID_unit Prob
235630 43 1 235630 1
235631 43 1 235631 1
235632 43 1 235632 1
235633 43 1 235633 1
80686 55 1 ANY 80686 1
16527 31 1 MORTGAGE 16527 1
四、系統抽樣
i=rep(1,N)
pik1=inclusionprobabilities(i,n) #采用等概率的系統抽樣,共抽取n個樣本
ssp=UPsystematic(pik1,eps=1e-6) #系統抽樣的函數
ss=getdata(LoanStats3c,ssp) #入選樣本
print(head(ss))
ID_unit id member_id loan_amnt funded_amnt funded_amnt_inv
283 283 37751794 40514790 12000 12000 12000
755 755 37741264 40504192 7000 7000 7000
1226 1226 37740968 40503825 3600 3600 3600
1697 1697 37700609 40473393 16000 16000 16000
2168 2168 37760342 40523083 22525 22525 22525
2640 2640 37257945 40030762 25000 25000 25000term int_rate installment grade sub_grade emp_title
283 36 months 11.99% 398.52 B B5 Office Administrator
755 36 months 6.49% 214.52 A A2 Associate Engineer
1226 36 months 13.66% 122.45 C C3 Software Engineer
1697 36 months 9.49% 512.46 B B2 Manager
2168 60 months 19.24% 587.29 E E2 Senior Typist
2640 60 months 16.49% 614.48 D D3 Parole Officeremp_length home_ownership annual_inc verification_status issue_d
283 7 years RENT 68200 VERIFIED - income source Dec-14
755 < 1 year RENT 65000 not verified Dec-14
1226 < 1 year RENT 67000 not verified Dec-14
1697 3 years MORTGAGE 90000 not verified Dec-14
2168 10+ years RENT 49000 VERIFIED - income Dec-14
2640 2 years RENT 60000 VERIFIED - income Dec-14loan_status pymnt_plan
283 Current n
755 Current n
1226 Current n
1697 Current n
2168 Current n
2640 Current nurl desc
283 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37751794
755 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37741264
1226 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37740968
1697 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37700609
2168 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37760342
2640 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37257945 purpose title zip_code addr_state dti
283 other Other 944xx CA 17.75
755 debt_consolidation Debt consolidation 100xx NY 7.94
1226 credit_card Credit card refinancing 276xx NC 21.73
1697 debt_consolidation Debt consolidation 982xx WA 25.92
2168 credit_card Credit card refinancing 120xx NY 33.60
2640 debt_consolidation Debt consolidation 995xx AK 38.66delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
283 0 Jun-93 1 30
755 0 Feb-01 0 30
1226 0 Feb-89 0 29
1697 0 May-05 2 30
2168 0 May-94 0 30
2640 0 Aug-01 1 30mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
283 69 8 0 17695 78.60% 14
755 69 12 0 10294 38.30% 39
1226 69 12 0 7191 53.30% 14
1697 69 16 0 13537 42.40% 29
2168 69 12 0 28745 96.50% 18
2640 69 17 0 16058 49.90% 54initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
283 f 10868.71 10868.71 1578.09 1578.09
755 f 6287.60 6287.60 854.29 854.29
1226 f 3268.51 3268.51 484.34 484.34
1697 f 14437.88 14437.88 2032.97 2032.97
2168 w 21598.46 21598.46 2325.08 2325.08
2640 w 23893.70 23893.70 2435.02 2435.02total_rec_prncp total_rec_int total_rec_late_fee recoveries
283 1131.29 446.80 0 0
755 712.40 141.89 0 0
1226 331.49 152.85 0 0
1697 1562.12 470.85 0 0
2168 926.54 1398.54 0 0
2640 1106.30 1328.72 0 0collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
283 0 Apr-15 398.52 May-15
755 0 Apr-15 214.52 May-15
1226 0 Apr-15 122.45 May-15
1697 0 Apr-15 512.46 May-15
2168 0 Apr-15 587.29 May-15
2640 0 Apr-15 614.48 May-15last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
283 Apr-15 0 43
755 Apr-15 0 43
1226 Apr-15 0 43
1697 Apr-15 0 43
2168 Apr-15 0 43
2640 Apr-15 0 43policy_code
283 1
755 1
1226 1
1697 1
2168 1
2640 1
五、多階段兩階段抽樣
#—————— 兩階段抽樣(手工實現,替代 mstage)——————#
# 說明:先抽取若干主簇(cluster),然后在每個被抽中的簇內抽取二級樣本。
# 參數(可調整)
primary_cluster_col <- "home_ownership" # 主簇變量名(請根據數據替換)
primary_k <- 3 # 希望抽取的主簇數
# 二級樣本總目標為 n(之前定義),按主簇均分
secondary_total <- n
# 清洗并驗證主簇列
if(!primary_cluster_col %in% names(LoanStats3c)) stop(sprintf("找不到列 %s,請檢查字段名", primary_cluster_col))
LoanStats3c[[primary_cluster_col]] <- trimws(as.character(LoanStats3c[[primary_cluster_col]]))
LoanStats3c[[primary_cluster_col]] <- ifelse(LoanStats3c[[primary_cluster_col]]=="", NA, LoanStats3c[[primary_cluster_col]])
clusters <- unique(na.omit(LoanStats3c[[primary_cluster_col]]))
n_clusters <- length(clusters)
if(n_clusters == 0) stop(sprintf("列 %s 沒有可用的簇(全部為 NA 或空字符串)", primary_cluster_col))
# 調整主簇數量不超過可用簇數
primary_k <- min(primary_k, n_clusters)
message(sprintf("可用簇數量 = %d,計劃抽取主簇 = %d", n_clusters, primary_k))
# 隨機抽取主簇(無放回)
set.seed(123) # 可選固定隨機種子,便于重現
primary_selected <- sample(clusters, primary_k)
message('抽中的主簇:')
print(primary_selected)
# 為每個主簇計算二級樣本目標(均分 + 余數分配)
base_sec <- floor(secondary_total / primary_k)
sec_sizes <- rep(base_sec, primary_k)
rem <- secondary_total - sum(sec_sizes)
if(rem > 0) {for(i in seq_len(rem)) sec_sizes[i] <- sec_sizes[i] + 1
}
names(sec_sizes) <- primary_selected
message('每個被抽中簇的目標二級樣本數(可能被截斷至簇容量):')
print(sec_sizes)
# 在每個被抽中簇內抽樣
sampled_rows <- integer(0)
per_cluster_actual <- integer(length(primary_selected))
for(i in seq_along(primary_selected)){cl <- primary_selected[i]rows_in_cl <- which(LoanStats3c[[primary_cluster_col]] == cl)cap <- length(rows_in_cl)target <- sec_sizes[i]if(cap == 0) {per_cluster_actual[i] <- 0next}take <- min(target, cap)# 若需要全部抽取可直接取全部chosen <- sample(rows_in_cl, take, replace = FALSE)sampled_rows <- c(sampled_rows, chosen)per_cluster_actual[i] <- length(chosen)
}
# 生成結果對象
mss <- LoanStats3c[sampled_rows, ]
ms <- list(primary_selected = primary_selected, per_cluster_target = sec_sizes, per_cluster_actual = per_cluster_actual)
message(sprintf('兩階段抽樣完成:共抽取 %d 行樣本', nrow(mss)))
message('各簇實際抽取數:')
print(setNames(per_cluster_actual, primary_selected))
# 保存或查看部分樣本
print(head(mss))
# 可選保存
# write.csv(mss, file = "D:/OneDrive - stu.fynu.edu.cn/大四上學期/ysq-大數據探索性分析/demo2/multistage_sample.csv", row.names = FALSE)
可用簇數量 = 4,計劃抽取主簇 = 3抽中的主簇:抽中的主簇:[1] "OWN" "ANY" "RENT"每個被抽中簇的目標二級樣本數(可能被截斷至簇容量):OWN ANY RENT 167 167 166 兩階段抽樣完成:共抽取 334 行樣本各簇實際抽取數:各簇實際抽取數:OWN ANY RENT 167 1 166 id member_id loan_amnt funded_amnt funded_amnt_inv term
28691 34442221 37105507 5000 5000 5000 36 months
17467 34874813 37558157 8200 8200 8200 36 months
32452 33230956 35874247 13050 13050 13050 60 months
114764 21370073 23682984 9300 9300 9300 36 months
45358 31367396 33940619 28000 28000 28000 60 months
65397 27650347 30153412 10000 10000 10000 36 monthsint_rate installment grade sub_grade emp_title emp_length
28691 11.99% 166.05 B B5 truck driver 7 years
17467 14.99% 284.22 C C5 flight attendant 1 year
32452 19.52% 342.27 E E2 Asst. Vice President 9 years
114764 16.99% 331.53 D D3 lab tech II 5 years
45358 13.35% 642.12 C C2 Staff Civil Engineer < 1 year
65397 14.99% 346.61 C C5 Computer Specialist 10+ yearshome_ownership annual_inc verification_status issue_d loan_status
28691 OWN 70000 not verified Nov-14 Current
17467 OWN 23000 VERIFIED - income source Nov-14 Current
32452 OWN 45000 VERIFIED - income source Nov-14 Current
114764 OWN 47544 VERIFIED - income source Jul-14 Current
45358 OWN 73000 VERIFIED - income source Oct-14 Current
65397 OWN 95000 VERIFIED - income source Oct-14 Currentpymnt_plan
28691 n
17467 n
32452 n
114764 n
45358 n
65397 nurl
28691 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34442221
17467 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34874813
32452 https://www.lendingclub.com/browse/loanDetail.action?loan_id=33230956
114764 https://www.lendingclub.com/browse/loanDetail.action?loan_id=21370073
45358 https://www.lendingclub.com/browse/loanDetail.action?loan_id=31367396
65397 https://www.lendingclub.com/browse/loanDetail.action?loan_id=27650347desc purpose title zip_code addr_state dti
28691 home_improvement Home improvement 604xx IL 20.69
17467 debt_consolidation Debt consolidation 410xx KY 34.13
32452 debt_consolidation Debt consolidation 330xx FL 32.75
114764 debt_consolidation Debt consolidation 631xx MO 18.60
45358 debt_consolidation Debt consolidation 193xx PA 11.61
65397 debt_consolidation Debt consolidation 114xx NY 4.90delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
28691 0 Jun-01 0 56
17467 1 Feb-99 3 17
32452 0 Oct-04 0 30
114764 0 Jun-96 1 30
45358 0 Feb-02 0 30
65397 4 Apr-93 0 1mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
28691 69 6 0 5057 84.30% 18
17467 69 9 0 10573 41.80% 16
32452 69 28 0 12713 88.90% 49
114764 60 18 1 7255 34.70% 51
45358 69 15 0 8689 25.70% 33
65397 69 10 0 4999 27.20% 45initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
28691 f 4407.83 4407.83 826.92 826.92
17467 w 7268.06 7268.06 1414.27 1414.27
32452 w 12378.56 12378.56 1697.20 1697.20
114764 w 7395.98 7395.98 2983.77 2983.77
45358 w 25960.28 25960.28 3831.95 3831.95
65397 w 8627.60 8627.60 2079.66 2079.66total_rec_prncp total_rec_int total_rec_late_fee recoveries
28691 592.17 234.75 0 0
17467 931.94 482.33 0 0
32452 671.44 1025.76 0 0
114764 1904.02 1079.75 0 0
45358 2039.72 1792.23 0 0
65397 1372.40 707.26 0 0collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
28691 0 Apr-15 166.05 May-15
17467 0 Apr-15 284.22 May-15
32452 0 Apr-15 342.27 May-15
114764 0 Apr-15 331.53 May-15
45358 0 Apr-15 642.12 May-15
65397 0 Apr-15 346.61 May-15last_credit_pull_d collections_12_mths_ex_med
28691 Apr-15 0
17467 Apr-15 0
32452 Apr-15 0
114764 Apr-15 0
45358 Apr-15 0
65397 Apr-15 0mths_since_last_major_derog policy_code
28691 43 1
17467 17 1
32452 43 1
114764 43 1
45358 43 1
65397 1 1id member_id loan_amnt funded_amnt funded_amnt_inv term
28691 34442221 37105507 5000 5000 5000 36 months
17467 34874813 37558157 8200 8200 8200 36 months
32452 33230956 35874247 13050 13050 13050 60 months
114764 21370073 23682984 9300 9300 9300 36 months
45358 31367396 33940619 28000 28000 28000 60 months
65397 27650347 30153412 10000 10000 10000 36 monthsint_rate installment grade sub_grade emp_title emp_length
28691 11.99% 166.05 B B5 truck driver 7 years
17467 14.99% 284.22 C C5 flight attendant 1 year
32452 19.52% 342.27 E E2 Asst. Vice President 9 years
114764 16.99% 331.53 D D3 lab tech II 5 years
45358 13.35% 642.12 C C2 Staff Civil Engineer < 1 year
65397 14.99% 346.61 C C5 Computer Specialist 10+ yearshome_ownership annual_inc verification_status issue_d loan_status
28691 OWN 70000 not verified Nov-14 Current
17467 OWN 23000 VERIFIED - income source Nov-14 Current
32452 OWN 45000 VERIFIED - income source Nov-14 Current
114764 OWN 47544 VERIFIED - income source Jul-14 Current
45358 OWN 73000 VERIFIED - income source Oct-14 Current
65397 OWN 95000 VERIFIED - income source Oct-14 Currentpymnt_plan
28691 n
17467 n
32452 n
114764 n
45358 n
65397 nurl
28691 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34442221
17467 https://www.lendingclub.com/browse/loanDetail.action?loan_id=34874813
32452 https://www.lendingclub.com/browse/loanDetail.action?loan_id=33230956
114764 https://www.lendingclub.com/browse/loanDetail.action?loan_id=21370073
45358 https://www.lendingclub.com/browse/loanDetail.action?loan_id=31367396
65397 https://www.lendingclub.com/browse/loanDetail.action?loan_id=27650347desc purpose title zip_code addr_state dti
28691 home_improvement Home improvement 604xx IL 20.69
17467 debt_consolidation Debt consolidation 410xx KY 34.13
32452 debt_consolidation Debt consolidation 330xx FL 32.75
114764 debt_consolidation Debt consolidation 631xx MO 18.60
45358 debt_consolidation Debt consolidation 193xx PA 11.61
65397 debt_consolidation Debt consolidation 114xx NY 4.90delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
28691 0 Jun-01 0 56
17467 1 Feb-99 3 17
32452 0 Oct-04 0 30
114764 0 Jun-96 1 30
45358 0 Feb-02 0 30
65397 4 Apr-93 0 1mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
28691 69 6 0 5057 84.30% 18
17467 69 9 0 10573 41.80% 16
32452 69 28 0 12713 88.90% 49
114764 60 18 1 7255 34.70% 51
45358 69 15 0 8689 25.70% 33
65397 69 10 0 4999 27.20% 45initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
28691 f 4407.83 4407.83 826.92 826.92
17467 w 7268.06 7268.06 1414.27 1414.27
32452 w 12378.56 12378.56 1697.20 1697.20
114764 w 7395.98 7395.98 2983.77 2983.77
45358 w 25960.28 25960.28 3831.95 3831.95
65397 w 8627.60 8627.60 2079.66 2079.66total_rec_prncp total_rec_int total_rec_late_fee recoveries
28691 592.17 234.75 0 0
17467 931.94 482.33 0 0
32452 671.44 1025.76 0 0
114764 1904.02 1079.75 0 0
45358 2039.72 1792.23 0 0
65397 1372.40 707.26 0 0collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
28691 0 Apr-15 166.05 May-15
17467 0 Apr-15 284.22 May-15
32452 0 Apr-15 342.27 May-15
114764 0 Apr-15 331.53 May-15
45358 0 Apr-15 642.12 May-15
65397 0 Apr-15 346.61 May-15last_credit_pull_d collections_12_mths_ex_med
28691 Apr-15 0
17467 Apr-15 0
32452 Apr-15 0
114764 Apr-15 0
45358 Apr-15 0
65397 Apr-15 0mths_since_last_major_derog policy_code
28691 43 1
17467 17 1
32452 43 1
114764 43 1
45358 43 1
65397 1 1
六、不等概抽樣
vol=LoanStats3c$total_acc
pik=inclusionprobabilities(vol,n) #以成交量為樣本規模求出包含概率
usp=UPmidzuno(pik) #用Midzuno方法進行不等概率抽樣
us=getdata(LoanStats3c,usp) #入選樣本
print(head(us))
ID_unit id member_id loan_amnt funded_amnt funded_amnt_inv
317 317 37800583 40563367 11875 11875 11875
880 880 37811222 40574134 3000 3000 3000
1187 1187 37661052 40423957 12300 12300 12300
1435 1435 37690890 40463735 25000 25000 25000
2732 2732 37098036 39860860 2000 2000 2000
3000 3000 37077808 39840608 14700 14700 14700term int_rate installment grade sub_grade
317 36 months 14.31% 407.65 C C4
880 36 months 14.99% 103.99 C C5
1187 36 months 6.49% 376.93 A A2
1435 36 months 6.03% 760.89 A A1
2732 36 months 12.99% 67.38 C C2
3000 36 months 14.99% 509.51 C C5emp_title emp_length home_ownership annual_inc
317 admission < 1 year OWN 55000.00
880 Contractor Installation Manager 3 years OWN 63000.00
1187 office manager < 1 year OWN 54000.00
1435 Director, Techincal Services 10+ years MORTGAGE 160000.00
2732 Client Response Communication 9 years MORTGAGE 26583.07
3000 Writer < 1 year RENT 100000.00verification_status issue_d loan_status pymnt_plan
317 not verified Dec-14 Current n
880 not verified Dec-14 Current n
1187 VERIFIED - income source Dec-14 Current n
1435 VERIFIED - income Dec-14 Current n
2732 not verified Dec-14 Current n
3000 not verified Dec-14 Current nurl desc
317 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37800583
880 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37811222
1187 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37661052
1435 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37690890
2732 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37098036
3000 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37077808 purpose title zip_code addr_state dti
317 credit_card Credit card refinancing 921xx CA 26.23
880 moving Moving and relocation 997xx AK 17.05
1187 debt_consolidation Debt consolidation 217xx MD 18.80
1435 credit_card Credit card refinancing 603xx IL 15.54
2732 other Other 844xx UT 11.87
3000 credit_card Credit card refinancing 100xx NY 8.07delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
317 0 Feb-02 1 30
880 0 Jul-99 1 41
1187 0 Jan-01 0 30
1435 0 Dec-94 0 30
2732 0 Dec-00 0 30
3000 0 Jan-66 0 40mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
317 86 28 1 19422 49.40% 46
880 69 20 0 3390 51.40% 52
1187 69 11 0 9839 32.50% 27
1435 69 9 0 36538 62.90% 27
2732 99 6 1 8675 70.50% 14
3000 69 7 0 32014 87.20% 34initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
317 w 10791.65 10791.65 1607.00 1607.00
880 f 2728.91 2728.91 413.46 413.46
1187 w 11048.26 11048.26 1503.29 1503.29
1435 f 22439.72 22439.72 3035.18 3035.18
2732 f 1814.09 1814.09 268.08 268.08
3000 f 13371.84 13371.84 2025.80 2025.80total_rec_prncp total_rec_int total_rec_late_fee recoveries
317 1083.35 523.65 0 0
880 271.09 142.37 0 0
1187 1251.74 251.55 0 0
1435 2560.28 474.90 0 0
2732 185.91 82.17 0 0
3000 1328.16 697.64 0 0collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
317 0 Apr-15 407.65 May-15
880 0 Apr-15 103.99 May-15
1187 0 Apr-15 376.93 May-15
1435 0 Apr-15 760.89 May-15
2732 0 Apr-15 67.38 May-15
3000 0 Apr-15 509.51 May-15last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
317 Apr-15 0 43
880 Apr-15 0 51
1187 Apr-15 0 43
1435 Apr-15 0 43
2732 Apr-15 0 43
3000 Apr-15 0 43policy_code
317 1
880 1
1187 1
1435 1
2732 1
3000 1
七、二重抽樣
#第一重抽樣采用簡單隨機抽樣確定層權,第二重抽樣采用分層抽樣
srsp1=srswor(3000,N)
srs1=getdata(LoanStats3c,srsp1) #第一重抽樣
srsp2=srswor(n,3000)
srs2=getdata(srs1,srsp2) #第二重抽樣
print(head(srs2))
ID_unit id member_id loan_amnt funded_amnt funded_amnt_inv
175 175 37641820 40404842 24000 24000 24000
407 407 37771625 40534643 7150 7150 7150
444 444 37661489 40424497 20000 20000 20000
529 529 37821449 40584421 6000 6000 6000
1180 1180 37670980 40433849 10000 10000 10000
1688 1688 37820639 40583405 12000 12000 12000term int_rate installment grade sub_grade
175 36 months 6.03% 730.46 A A1
407 36 months 17.14% 255.42 D D4
444 36 months 11.99% 664.20 B B5
529 36 months 10.49% 194.99 B B3
1180 60 months 14.99% 237.85 C C5
1688 36 months 11.99% 398.52 B B5emp_title emp_length home_ownership annual_inc
175 Captain/Paramedic 10+ years MORTGAGE 120000
407 Customer Care Specialust < 1 year RENT 30000
444 Operations Management < 1 year MORTGAGE 85000
529 Sales Manager 10+ years RENT 100000
1180 Policies & Procedures Administrator 10+ years MORTGAGE 82000
1688 Safety and Security Offider < 1 year MORTGAGE 85000verification_status issue_d loan_status pymnt_plan
175 VERIFIED - income source Dec-14 Current n
407 VERIFIED - income source Dec-14 Current n
444 VERIFIED - income source Dec-14 Fully Paid n
529 not verified Dec-14 Current n
1180 VERIFIED - income Dec-14 Current n
1688 VERIFIED - income source Dec-14 Current nurl desc
175 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37641820
407 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37771625
444 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37661489
529 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37821449
1180 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37670980
1688 https://www.lendingclub.com/browse/loanDetail.action?loan_id=37820639 purpose title zip_code addr_state dti
175 credit_card Credit card refinancing 957xx CA 19.94
407 debt_consolidation Debt consolidation 284xx NC 9.52
444 credit_card Credit card refinancing 597xx MT 11.00
529 debt_consolidation Debt consolidation 917xx CA 9.99
1180 credit_card Credit card refinancing 801xx CO 10.08
1688 debt_consolidation Debt consolidation 063xx CT 7.71delinq_2yrs earliest_cr_line inq_last_6mths mths_since_last_delinq
175 1 Feb-98 0 8
407 0 Oct-11 2 30
444 0 Jul-04 0 30
529 0 Nov-97 1 59
1180 2 Oct-01 2 2
1688 0 Jul-99 0 45mths_since_last_record open_acc pub_rec revol_bal revol_util total_acc
175 69 9 0 32544 46.70% 26
407 69 12 0 7239 55.30% 14
444 69 5 0 19542 97.70% 7
529 69 12 0 5914 32.10% 18
1180 69 21 0 7952 27% 54
1688 69 9 0 5143 57.10% 25initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
175 w 22161.21 22161.21 2179.32 2179.32
407 f 6523.56 6523.56 1011.47 1011.47
444 f 0.00 0.00 20470.31 20470.31
529 f 5422.32 5422.32 774.72 774.72
1180 w 9539.73 9539.73 943.07 943.07
1688 f 10868.71 10868.71 1578.09 1578.09total_rec_prncp total_rec_int total_rec_late_fee recoveries
175 1838.79 340.53 0 0
407 626.44 385.03 0 0
444 20000.00 470.31 0 0
529 577.68 197.04 0 0
1180 460.27 482.80 0 0
1688 1131.29 446.80 0 0collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d
175 0 Apr-15 730.46 May-15
407 0 Apr-15 255.42 May-15
444 0 Feb-15 19826.09
529 0 Apr-15 194.99 May-15
1180 0 Apr-15 237.85 May-15
1688 0 Apr-15 398.52 May-15last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
175 Apr-15 0 43
407 Apr-15 0 43
444 Mar-15 0 43
529 Apr-15 0 59
1180 Apr-15 0 43
1688 Apr-15 0 45policy_code
175 1
407 1
444 1
529 1
1180 1
1688 1