關聯規則算法學習—Apriori
Apriori算法是關聯規則挖掘中的經典算法,用於發現數據集中的頻繁項集和強關聯規則。其核心思想基於先驗性質:若一個項集是頻繁的,則其所有非空子集也一定是頻繁的;反之,若一個項集不頻繁,則其所有超集也一定不頻繁。該算法通過逐層搜索的迭代方法(由頻繁k項集生成候選k+1項集)高效挖掘關聯規則。
要求:
理解並掌握關聯規則經典算法Apriori算法,理解算法的原理,能夠實現算法,並對給定的數據集進行關聯規則挖掘。
代碼實現:
import pandas as pd
from itertools import combinations
from collections import defaultdict# 讀取數據
# Path of the grocery transactions CSV; it must contain an ``items`` column.
DATA_PATH = '實驗2-Groceries(1).csv'


def parse_items(raw):
    """Turn one raw ``items`` cell into a set of item names.

    The cell is expected to look like ``{item a,item b}``, optionally wrapped
    in or containing double quotes. Surrounding braces/quotes are removed, the
    text is split on commas, whitespace around each item is trimmed and empty
    fragments are dropped (so ``{}`` yields an empty set rather than ``{''}``).

    Args:
        raw: The string stored in the ``items`` column for one transaction.

    Returns:
        A ``set`` of item-name strings.
    """
    cleaned = raw.strip('{}"').replace('"', '')
    pieces = (piece.strip() for piece in cleaned.split(','))
    return {piece for piece in pieces if piece}


def load_transactions(path):
    """Read the CSV at *path* and return its ``items`` column as a list of sets."""
    data = pd.read_csv(path)
    return [parse_items(items) for items in data['items']]


def _generate_candidates(frequent_k, k):
    """Build candidate (k+1)-itemsets from the frequent k-itemsets.

    Join step: union every unordered pair of frequent k-itemsets and keep the
    unions that have exactly k+1 items. Prune step (Apriori property): drop a
    candidate if any of its k-item subsets is not itself frequent, since such
    a candidate cannot be frequent and need not be counted.

    Args:
        frequent_k: Mapping (or set) whose keys are frozensets of size k.
        k: Size of the itemsets in *frequent_k*.

    Returns:
        A set of frozensets, each of size k+1.
    """
    candidates = set()
    for left, right in combinations(frequent_k, 2):
        union_set = left | right
        if len(union_set) != k + 1 or union_set in candidates:
            continue
        if all(frozenset(sub) in frequent_k
               for sub in combinations(union_set, k)):
            candidates.add(union_set)
    return candidates


def get_frequent_itemsets(transactions, min_support):
    """Find all frequent itemsets with the level-wise Apriori algorithm.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable
            items. Repeated items inside one transaction count once.
        min_support: Minimum support, as a fraction of the number of
            transactions, an itemset needs in order to be kept.

    Returns:
        A dict mapping each frequent itemset (``frozenset``) to its support
        (``count / number_of_transactions``). Empty input yields ``{}``.
    """
    # Normalise once so duplicates inside a transaction are not double counted
    # and so any iterable (including a one-shot iterator) is accepted.
    transactions = [frozenset(transaction) for transaction in transactions]
    num_transactions = len(transactions)

    # Pass 1: support of every single item.
    item_counts = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            item_counts[item] += 1

    current_frequent = {}
    for item, count in item_counts.items():
        support = count / num_transactions
        if support >= min_support:
            current_frequent[frozenset([item])] = support

    frequent_itemsets = {}
    k = 1
    while current_frequent:
        frequent_itemsets.update(current_frequent)

        # Candidates of size k+1, already pruned with the Apriori property.
        next_candidates = _generate_candidates(current_frequent, k)

        # Pass k+1: count how many transactions contain each candidate.
        candidate_counts = defaultdict(int)
        for transaction in transactions:
            for candidate in next_candidates:
                if candidate.issubset(transaction):
                    candidate_counts[candidate] += 1

        current_frequent = {}
        for itemset, count in candidate_counts.items():
            support = count / num_transactions
            if support >= min_support:
                current_frequent[itemset] = support
        k += 1

    return frequent_itemsets


def generate_association_rules(frequent_itemsets, min_confidence):
    """Derive association rules from a table of frequent itemsets.

    For every frequent itemset with at least two items, each non-empty proper
    subset is tried as the antecedent, with the remaining items as the
    consequent. Confidence is ``support(itemset) / support(antecedent)``.

    Args:
        frequent_itemsets: Dict mapping ``frozenset`` itemsets to support, as
            returned by :func:`get_frequent_itemsets`.
        min_confidence: Minimum confidence a rule needs in order to be kept.

    Returns:
        A list of ``(antecedent, consequent, support, confidence)`` tuples,
        where antecedent and consequent are frozensets and *support* is the
        support of the whole itemset.
    """
    rules = []
    for itemset, support_itemset in frequent_itemsets.items():
        if len(itemset) < 2:
            continue
        for size in range(1, len(itemset)):
            for subset in combinations(itemset, size):
                antecedent = frozenset(subset)
                support_antecedent = frequent_itemsets.get(antecedent, 0)
                # An antecedent missing from the table has no known support,
                # so no confidence can be computed for it.
                if support_antecedent <= 0:
                    continue
                confidence = support_itemset / support_antecedent
                if confidence >= min_confidence:
                    rules.append((antecedent, itemset - antecedent,
                                  support_itemset, confidence))
    return rules


def main():
    """Mine the grocery data set and print itemsets, rules and a parameter sweep."""
    # Load the data and convert each string-encoded itemset into a set.
    transactions = load_transactions(DATA_PATH)
    print(f"總交易數: {len(transactions)}")
    print(f"前5條交易示例: {transactions[:5]}")

    # Support and confidence thresholds.
    min_support = 0.05  # 5% support
    min_confidence = 0.3  # 30% confidence

    frequent_itemsets = get_frequent_itemsets(transactions, min_support)
    rules = generate_association_rules(frequent_itemsets, min_confidence)

    # Highest-support rules first.
    sorted_rules = sorted(rules, key=lambda x: x[2], reverse=True)

    print("\n頻繁項集 (支持度 ≥ {}):".format(min_support))
    for itemset, support in frequent_itemsets.items():
        if len(itemset) >= 2:  # only show itemsets with more than one item
            print(f"{set(itemset)}: {support:.3f}")

    print("\n關聯規則 (置信度 ≥ {}):".format(min_confidence))
    for antecedent, consequent, support, confidence in sorted_rules[:20]:
        print(f"{set(antecedent)} => {set(consequent)} (支持度: {support:.3f}, 置信度: {confidence:.3f})")

    # Try several support/confidence combinations.
    parameters = [
        (0.05, 0.3),   # original parameters
        (0.03, 0.4),   # lower support, higher confidence
        (0.08, 0.25),  # higher support, lower confidence
    ]
    for sup, conf in parameters:
        print(f"\n參數: 最小支持度={sup}, 最小置信度={conf}")
        freq_itemsets = get_frequent_itemsets(transactions, sup)
        rules = generate_association_rules(freq_itemsets, conf)
        print(f"頻繁項集數量: {len(freq_itemsets)}")
        print(f"關聯規則數量: {len(rules)}")
        if rules:
            # Show the rule with the highest support.
            top_rule = max(rules, key=lambda x: x[2])
            print("支持度最高的規則:")
            print(f"{set(top_rule[0])} => {set(top_rule[1])} (支持度: {top_rule[2]:.3f}, 置信度: {top_rule[3]:.3f})")


if __name__ == "__main__":
    main()