超詳細解說 sklearn 中的聯合特征(FeatureUnion)
1. FeatureUnion 簡介
FeatureUnion 是 scikit-learn 中的一個工具,用于并行地組合多個特征提取器的輸出。它允許你將不同的特征提取方法(如文本向量化、數值特征縮放、自定義特征工程等)的結果**橫向拼接(concatenate)**成一個更大的特征矩陣。
核心思想:
- 并行處理:每個特征提取器獨立處理原始數據。
- 特征拼接:將所有提取器的輸出在特征維度(列方向)上拼接。
- 統一接口:對外提供與單個轉換器相同的 fit、transform、fit_transform 接口。
適用場景:
- 同時使用多種特征提取方法(如 TF-IDF + 詞袋模型 + 自定義統計特征)。
- 處理異構數據(如文本 + 數值 + 分類特征)。
- 構建復雜特征工程流水線。
2. FeatureUnion 基本語法
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Build several independent feature extractors
tfidf = TfidfVectorizer()
bow = CountVectorizer()
scaler = StandardScaler()

# Combine their outputs side by side with FeatureUnion
combined_features = FeatureUnion([
    ('tfidf', tfidf),
    ('bow', bow),
    # ('scaler', scaler)  # NOTE: a scaler expects numeric input and cannot consume raw text
])

# Usage: same fit/transform interface as a single transformer
X_transformed = combined_features.fit_transform(X)
!注意:所有特征提取器必須能處理相同的輸入數據格式(如都是文本或都是數值數組)。
3.完整代碼示例
示例1:文本特征組合(TF-IDF+詞袋模型)
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load two newsgroup categories, stripping metadata that leaks labels
categories = ['alt.atheism', 'soc.religion.christian']
newsgroups = fetch_20newsgroups(subset='all', categories=categories,
                                remove=('headers', 'footers', 'quotes'))
X, y = newsgroups.data, newsgroups.target

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF unigrams plus bag-of-words uni/bigrams, concatenated column-wise
feature_union = FeatureUnion([
    ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
    ('bow', CountVectorizer(max_features=500, ngram_range=(1, 2), stop_words='english')),
])

# Fit the extractors on training data only; reuse the fitted union on the test set
X_train_features = feature_union.fit_transform(X_train)
X_test_features = feature_union.transform(X_test)

print(f"訓練集特征維度: {X_train_features.shape}")  # (n_samples, 1500) = 1000 (TF-IDF) + 500 (BOW)
print(f"測試集特征維度: {X_test_features.shape}")

# Train a linear classifier on the combined features
clf = LogisticRegression(random_state=42)
clf.fit(X_train_features, y_train)

# Predict and print a per-class report
y_pred = clf.predict(X_test_features)
print("\n分類報告:")
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))
示例2:數值特征組合(PCA+原始特征)
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load the iris dataset (4 numeric features)
iris = load_iris()
X, y = iris.data, iris.target

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Concatenate 2 PCA components with the 4 standardized original features
feature_union = FeatureUnion([
    ('pca', PCA(n_components=2)),        # reduce to 2 principal components
    ('scaler', StandardScaler()),        # standardize the original features
])

# Fit on train, transform both splits
X_train_combined = feature_union.fit_transform(X_train)
X_test_combined = feature_union.transform(X_test)

print(f"原始特征維度: {X_train.shape[1]}")  # 4
print(f"組合后特征維度: {X_train_combined.shape[1]}")  # 6 = 2 (PCA) + 4 (scaled)

# Train and evaluate a random forest
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_combined, y_train)

accuracy = clf.score(X_test_combined, y_test)
print(f"\n測試集準確率: {accuracy:.4f}")
示例3:自定義特征提取器+內置提取器
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re


class TextStatsExtractor(BaseEstimator, TransformerMixin):
    """Custom transformer producing simple per-document text statistics.

    For each input string it emits five columns:
    [length, word_count, uppercase_ratio, exclamation_count, question_count].
    """

    def fit(self, X, y=None):
        # Stateless transformer — nothing to learn
        return self

    def transform(self, X):
        rows = []
        for text in X:
            length = len(text)
            word_count = len(text.split())
            # +1e-8 guards against division by zero on empty strings
            uppercase_ratio = sum(1 for c in text if c.isupper()) / (len(text) + 1e-8)
            exclamation_count = text.count('!')
            question_count = text.count('?')
            rows.append([length, word_count, uppercase_ratio,
                         exclamation_count, question_count])
        return np.array(rows)
# Example documents
texts = [
    "This is a GREAT product!!!",
    "I hate this item... it's terrible.",
    "Average quality, nothing special.",
    "AMAZING!!! Best purchase ever!!!",
    "Not bad, but could be better?",
]

# Combine TF-IDF with the handcrafted statistics features
feature_union = FeatureUnion([
    ('tfidf', TfidfVectorizer(max_features=50, stop_words='english')),
    ('stats', TextStatsExtractor()),
])

# Fit the union and transform the documents
X_combined = feature_union.fit_transform(texts)

print("特征名稱:")
feature_names = []
# Names of the TF-IDF columns
tfidf_names = feature_union.transformer_list[0][1].get_feature_names_out()
feature_names.extend([f"tfidf_{name}" for name in tfidf_names])
# Names of the statistics columns
stat_names = ['length', 'word_count', 'uppercase_ratio', 'exclamation_count', 'question_count']
feature_names.extend(stat_names)

print(f"總特征數: {X_combined.shape[1]}")
print("前5個特征名:", feature_names[:5])
print("后5個特征名:", feature_names[-5:])

print("\n轉換后的特征矩陣:")
print(X_combined.toarray())
4.高級用法
4.1設置權重(transformer_weights)
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Give each extractor's output a multiplicative weight
feature_union = FeatureUnion([
    ('tfidf', TfidfVectorizer(max_features=100)),
    ('bow', CountVectorizer(max_features=100)),
], transformer_weights={
    'tfidf': 1.0,
    'bow': 0.5,  # bag-of-words columns are scaled by 0.5
})

# Weights are applied to each transformer's output after transform
X_weighted = feature_union.fit_transform(texts)
4.2與Pipeline結合使用
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.datasets import fetch_20newsgroups

# Full modelling pipeline: feature union -> scaling -> classifier
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer(max_features=1000)),
        ('stats', TextStatsExtractor()),  # assumes the class is already defined
    ])),
    ('scaler', StandardScaler(with_mean=False)),  # sparse matrices cannot use with_mean=True
    ('classifier', SVC(kernel='linear')),
])

# Fit end to end on raw documents
newsgroups = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'soc.religion.christian'])
pipeline.fit(newsgroups.data, newsgroups.target)
4.3使用make_union快捷方式
from sklearn.pipeline import make_union
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# make_union auto-generates step names from the estimator classes
union = make_union(
    PCA(n_components=2),
    SelectKBest(k=3),
    n_jobs=1,  # number of parallel jobs
)
5.重要注意事項
5.1輸入數據一致性
# WRONG: mixing extractors that expect different input types
feature_union = FeatureUnion([
    ('tfidf', TfidfVectorizer()),    # consumes raw text
    ('scaler', StandardScaler()),    # consumes numeric arrays — this will fail!
])
5.2稀疏矩陣處理
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# TF-IDF yields a sparse matrix
tfidf = TfidfVectorizer()
X_sparse = tfidf.fit_transform(texts)

# StandardScaler cannot mean-center a sparse matrix
# scaler = StandardScaler()  # would raise!
# Fix: disable mean-centering
scaler = StandardScaler(with_mean=False)  # safe for sparse input
5.3 并行處理 (n_jobs)
# Run the two vectorizers in parallel across all CPU cores
feature_union = FeatureUnion([
    ('tfidf1', TfidfVectorizer(max_features=1000)),
    ('tfidf2', TfidfVectorizer(max_features=1000, ngram_range=(2, 2))),
], n_jobs=-1)
6.調試與檢查
6.1查看各組件輸出維度
feature_union = FeatureUnion([
    ('tfidf', TfidfVectorizer(max_features=500)),
    ('bow', CountVectorizer(max_features=300)),
])

X_combined = feature_union.fit_transform(texts)

# Inspect the output width contributed by each component
for name, transformer in feature_union.transformer_list:
    X_part = transformer.transform(texts)
    print(f"{name}: {X_part.shape}")

print(f"Combined: {X_combined.shape}")
6.2 獲取特征名稱
def get_feature_names(feature_union):"""獲取 FeatureUnion 的所有特征名稱"""feature_names = []for name, transformer in feature_union.transformer_list:if hasattr(transformer, 'get_feature_names_out'):names = transformer.get_feature_names_out()elif hasattr(transformer, 'get_feature_names'):names = transformer.get_feature_names()else:# 對于自定義轉換器,可能需要手動定義n_features = transformer.transform([texts[0]]).shape[1]names = [f"{name}_feature_{i}" for i in range(n_features)]feature_names.extend([f"{name}__{n}" for n in names])return feature_names# 使用示例
# Usage example
feature_names = get_feature_names(feature_union)
print(f"總特征數: {len(feature_names)}")
print("前10個特征名:", feature_names[:10])
7. 替代方案:ColumnTransformer
對于異構數據(不同列不同類型),推薦使用 ColumnTransformer:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Small demo frame with text, numeric and categorical columns
df = pd.DataFrame({
    'text': ['good product', 'bad service', 'excellent quality'],
    'price': [100, 200, 150],
    'category': ['A', 'B', 'A'],
})

# Route each column to the transformer suited to its type
preprocessor = ColumnTransformer([
    ('text', TfidfVectorizer(), 'text'),
    ('num', StandardScaler(), ['price']),
    ('cat', OneHotEncoder(), ['category']),
])

X_transformed = preprocessor.fit_transform(df)
print(f"轉換后形狀: {X_transformed.shape}")
8. 總結
FeatureUnion 的優勢:
- ✅ 簡化多特征提取器的組合
- ✅ 提供統一的接口
- ✅ 支持并行處理
- ✅ 可與 Pipeline 無縫集成
使用建議:
- 當需要組合同類型數據的多種特征提取方法時使用
- 對于異構數據,優先考慮 ColumnTransformer
- 注意稀疏矩陣的處理
- 合理設置特征權重
- 使用 make_union 簡化代碼
最佳實踐:
# Recommended end-to-end workflow
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_score

# 1. Define the feature combination
features = FeatureUnion([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('custom', CustomFeatureExtractor()),
])

# 2. Build the complete pipeline
pipeline = Pipeline([
    ('features', features),
    ('classifier', LogisticRegression()),
])

# 3. Evaluate with 5-fold cross-validation
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"平均準確率: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
通過合理使用 FeatureUnion
,你可以構建強大的特征工程系統,顯著提升機器學習模型的性能!