醋醋百科网

Good Luck To You!

Python教程(四十五):推荐系统-个性化推荐算法

今日目标

- 理解推荐系统的基本概念和类型

- 掌握协同过滤算法(基于用户和基于物品)

- 学会基于内容的推荐方法

- 了解矩阵分解和深度学习推荐

- 掌握推荐系统评估和优化技术

推荐系统概述

推荐系统是信息过滤系统,用于预测用户对物品的偏好:

- 协同过滤:基于用户或物品的相似性

- 基于内容:基于物品特征和用户偏好

- 混合推荐:结合多种推荐方法

- 深度学习:使用神经网络进行推荐

推荐系统应用领域

# 主要应用领域:
# - 电商平台:商品推荐
# - 视频网站:内容推荐
# - 音乐平台:歌曲推荐
# - 社交媒体:好友推荐
# - 新闻网站:文章推荐
# - 游戏平台:游戏推荐

推荐系统基础

1. 安装和导入

# 安装依赖(在终端中执行, 不是Python代码): pip install pandas numpy matplotlib seaborn scikit-learn scipy surprise
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from scipy.sparse import csr_matrix
import warnings
# NOTE: this silences every warning for the whole session (including
# convergence and deprecation warnings); kept because the tutorial relies on
# a quiet console.
warnings.filterwarnings('ignore')

# Apply the Seaborn style FIRST: seaborn's style dictionary contains its own
# 'font.sans-serif' entry, so calling set_style after configuring the font
# would overwrite the CJK font list and Chinese labels would fall back to
# missing-glyph boxes.
sns.set_style("whitegrid")

# Fonts able to render the Chinese titles/labels used below. Matplotlib uses
# the first installed family in the list, so several common CJK fonts are
# listed for Windows, macOS and Linux instead of SimHei only.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
# Render the minus sign with an ASCII hyphen; many CJK fonts lack U+2212.
plt.rcParams['axes.unicode_minus'] = False

print("推荐系统环境设置完成")

2. 创建示例数据

def _plot_data_overview(ratings_df, items_df):
    """Show a 2x2 figure summarising the generated ratings and items.

    Panels: rating histogram, ratings-per-user histogram, ratings-per-item
    histogram, and a pie chart of the item categories.
    """
    _, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, values, bins, title, x label, y label) for the histogram panels.
    histogram_panels = [
        (axes[0, 0], ratings_df['rating'], 5,
         '评分分布', '评分', '频次'),
        (axes[0, 1], ratings_df.groupby('user_id').size(), 20,
         '用户评分数量分布', '评分数量', '用户数量'),
        (axes[1, 0], ratings_df.groupby('item_id').size(), 20,
         '物品评分数量分布', '评分数量', '物品数量'),
    ]
    for ax, values, bins, title, xlabel, ylabel in histogram_panels:
        ax.hist(values, bins=bins, alpha=0.7, edgecolor='black')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.grid(True, alpha=0.3)

    category_counts = items_df['category'].value_counts()
    axes[1, 1].pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
    axes[1, 1].set_title('物品类别分布')

    plt.tight_layout()
    plt.show()


def create_recommendation_data(n_users=100, n_items=50, seed=42, show_plots=True):
    """Create synthetic data for the recommender-system examples.

    Args:
        n_users: Number of users to generate (must be positive).
        n_items: Number of items to generate (must be positive).
        seed: Value passed to ``np.random.seed``. This reseeds NumPy's
            *global* generator, exactly as the original hard-coded seed did.
        show_plots: When True, display the 2x2 overview figure.

    Returns:
        dict with keys:
            'ratings_df': one row per rating (user_id, item_id, rating 1-5).
            'items_df': item attributes (name, category, genre, price,
                description).
            'users_df': user attributes (age, gender, preference).
            'rating_matrix': users x items DataFrame with 0 for "not rated".
                It always has shape (n_users, n_items), including items that
                happened to receive no rating.

    Raises:
        ValueError: If ``n_users`` or ``n_items`` is not positive.
    """
    if n_users <= 0 or n_items <= 0:
        raise ValueError("n_users and n_items must be positive")

    np.random.seed(seed)

    print("=== 创建推荐系统数据 ===")

    # 1. Sparse user-item ratings: each user rates 10-20 distinct items
    #    (capped at n_items so sampling without replacement cannot fail).
    ratings_data = []
    for user_id in range(n_users):
        n_ratings = min(np.random.randint(10, 21), n_items)
        item_ids = np.random.choice(n_items, n_ratings, replace=False)

        for item_id in item_ids:
            rating = np.random.randint(1, 6)  # integer rating in 1..5
            ratings_data.append({
                'user_id': user_id,
                'item_id': item_id,
                'rating': rating
            })

    ratings_df = pd.DataFrame(ratings_data)

    # 2. Item attributes; the description is the text used by the
    #    content-based example.
    categories = ['电影', '音乐', '书籍', '游戏', '电子产品']
    genres = ['动作', '喜剧', '科幻', '恐怖', '爱情', '纪录片', '动画']

    items_data = []
    for item_id in range(n_items):
        category = np.random.choice(categories)
        genre = np.random.choice(genres)
        price = np.random.randint(10, 1000)

        items_data.append({
            'item_id': item_id,
            'name': f'物品{item_id}',
            'category': category,
            'genre': genre,
            'price': price,
            'description': f'这是一个{category}类别的{genre}作品,价格{price}元'
        })

    items_df = pd.DataFrame(items_data)

    # 3. User attributes.
    ages = np.random.randint(18, 65, n_users)
    genders = np.random.choice(['男', '女'], n_users)

    users_data = []
    for user_id in range(n_users):
        users_data.append({
            'user_id': user_id,
            'age': ages[user_id],
            'gender': genders[user_id],
            'preference': np.random.choice(categories)
        })

    users_df = pd.DataFrame(users_data)

    # Sparsity is the share of EMPTY cells, i.e. 1 - density. (The original
    # printed the density under the "sparsity" label.)
    density = len(ratings_df) / (n_users * n_items)
    print(f"用户数量: {n_users}")
    print(f"物品数量: {n_items}")
    print(f"评分数量: {len(ratings_df)}")
    print(f"稀疏度: {(1 - density) * 100:.2f}%")

    # 4. Dense users x items matrix. Reindex to the full id ranges so that an
    #    item (or user) without any rating still gets an all-zero column/row;
    #    a plain pivot would silently drop it and break positional lookups.
    rating_matrix = (
        ratings_df.pivot(index='user_id', columns='item_id', values='rating')
        .reindex(index=pd.RangeIndex(n_users, name='user_id'),
                 columns=pd.RangeIndex(n_items, name='item_id'))
        .fillna(0)
    )

    if show_plots:
        _plot_data_overview(ratings_df, items_df)

    return {
        'ratings_df': ratings_df,
        'items_df': items_df,
        'users_df': users_df,
        'rating_matrix': rating_matrix
    }

# Run the data-creation example; the later examples read this module-level
# `rec_data` dict.
rec_data = create_recommendation_data()

协同过滤算法

1. 基于用户的协同过滤

def user_based_collaborative_filtering():
    """Demonstrate user-based collaborative filtering on ``rec_data``.

    Builds a user-user cosine-similarity matrix, prints recommendations for
    user 0, shows a similarity heatmap, and runs a small hold-out evaluation
    on the first 20 users.

    Returns:
        dict with keys 'user_similarity' (DataFrame), 'recommendations'
        (list of (item_id, predicted_rating) for the example user) and
        'evaluation_results' (list of (precision, recall) tuples).
    """
    rating_matrix = rec_data['rating_matrix']
    # Look items up by their id rather than by DataFrame row label.
    items_by_id = rec_data['items_df'].set_index('item_id')

    print("=== 基于用户的协同过滤 ===")

    def compute_user_similarity(matrix):
        """Return the user-user cosine similarity of *matrix* as a DataFrame."""
        return pd.DataFrame(cosine_similarity(matrix),
                            index=matrix.index,
                            columns=matrix.index)

    # 1. User-user similarity on the full rating matrix.
    user_similarity_df = compute_user_similarity(rating_matrix)

    print(f"用户相似度矩阵形状: {user_similarity_df.shape}")

    # 2. Recommendation for a single user.
    def recommend_for_user(user_id, n_recommendations=5, matrix=None,
                           similarity_df=None, n_neighbors=10):
        """Recommend unrated items via a similarity-weighted neighbour average.

        ``matrix`` / ``similarity_df`` default to the full data; the
        evaluation passes masked versions so held-out ratings cannot leak.
        Returns a list of (item_id, predicted_rating), best first.
        """
        matrix = rating_matrix if matrix is None else matrix
        similarity_df = user_similarity_df if similarity_df is None else similarity_df

        if user_id not in matrix.index:
            return []

        user_ratings = matrix.loc[user_id]
        unrated_items = user_ratings[user_ratings == 0].index
        if len(unrated_items) == 0:
            return []

        # Drop the user explicitly instead of assuming the self-similarity
        # sorts first (ties, or an all-zero row, would break that assumption).
        similar_users = (similarity_df.loc[user_id]
                         .drop(user_id)
                         .sort_values(ascending=False)
                         .head(n_neighbors))

        predictions = {}
        for item_id in unrated_items:
            numerator = 0
            denominator = 0

            for similar_user_id, similarity in similar_users.items():
                if similarity <= 0:  # only positively similar neighbours
                    continue
                neighbour_rating = matrix.loc[similar_user_id, item_id]
                if neighbour_rating > 0:  # only neighbours who rated the item
                    numerator += similarity * neighbour_rating
                    denominator += similarity

            if denominator > 0:
                predictions[item_id] = numerator / denominator

        ranked = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
        return ranked[:n_recommendations]

    # 3. Recommendations for the example user.
    example_user = 0
    recommendations = recommend_for_user(example_user, n_recommendations=10)

    print(f"\n为用户{example_user}的推荐结果:")
    for item_id, predicted_rating in recommendations:
        item_name = items_by_id.loc[item_id, 'name']
        category = items_by_id.loc[item_id, 'category']
        print(f"物品{item_id} ({item_name}, {category}): 预测评分 {predicted_rating:.2f}")

    # 4. Heatmap of the similarity between the first 20 users.
    plt.figure(figsize=(10, 8))
    sns.heatmap(user_similarity_df.iloc[:20, :20], cmap='coolwarm', center=0)
    plt.title('用户相似度热力图 (前20个用户)')
    plt.xlabel('用户ID')
    plt.ylabel('用户ID')
    plt.show()

    # 5. Hold-out evaluation.
    def evaluate_recommendations(user_id, n_recommendations=5):
        """Hide some of the user's ratings and check whether they are recommended.

        Returns (precision, recall), or None when the user has fewer than
        two ratings. Draws from NumPy's global random generator.
        """
        user_ratings = rating_matrix.loc[user_id]
        rated_items = user_ratings[user_ratings > 0]

        if len(rated_items) < 2:
            return None

        test_size = min(len(rated_items) // 2, 5)
        test_items = np.random.choice(rated_items.index, test_size, replace=False)

        # Mask the held-out ratings on a COPY so the shared rec_data matrix is
        # never modified, and recompute the similarity from the masked data so
        # the held-out ratings cannot influence the neighbour selection.
        masked_matrix = rating_matrix.copy()
        masked_matrix.loc[user_id, test_items] = 0
        masked_similarity_df = compute_user_similarity(masked_matrix)

        held_out_recs = recommend_for_user(user_id, n_recommendations,
                                           matrix=masked_matrix,
                                           similarity_df=masked_similarity_df)
        recommended_items = {item_id for item_id, _ in held_out_recs}
        held_out_items = set(test_items)

        hits = len(recommended_items & held_out_items)
        precision = hits / len(recommended_items) if recommended_items else 0
        recall = hits / len(held_out_items) if held_out_items else 0

        return precision, recall

    # Evaluate the first 20 users by label (ids need not equal positions).
    evaluation_results = []
    for user_id in rating_matrix.index[:20]:
        result = evaluate_recommendations(user_id)
        if result is not None:
            evaluation_results.append(result)

    if evaluation_results:
        avg_precision = np.mean([r[0] for r in evaluation_results])
        avg_recall = np.mean([r[1] for r in evaluation_results])

        print(f"\n推荐质量评估 (基于{len(evaluation_results)}个用户):")
        print(f"平均精确率: {avg_precision:.3f}")
        print(f"平均召回率: {avg_recall:.3f}")

    return {
        'user_similarity': user_similarity_df,
        'recommendations': recommendations,
        'evaluation_results': evaluation_results
    }

# Run the user-based collaborative filtering example.
user_cf_results = user_based_collaborative_filtering()

2. 基于物品的协同过滤

def item_based_collaborative_filtering():
    """Demonstrate item-based collaborative filtering on ``rec_data``.

    Builds an item-item cosine-similarity matrix from the rating matrix,
    prints recommendations for user 0, shows a similarity heatmap, and lists
    the items most similar to item 0.

    Returns:
        dict with keys 'item_similarity' (DataFrame), 'recommendations'
        (list of (item_id, predicted_rating)) and 'similar_items' (Series of
        similarities indexed by item id).
    """
    rating_matrix = rec_data['rating_matrix']
    # Look items up by their id rather than by DataFrame row label.
    items_by_id = rec_data['items_df'].set_index('item_id')

    print("=== 基于物品的协同过滤 ===")

    # 1. Item-item similarity: items are the columns, hence the transpose.
    item_similarity_df = pd.DataFrame(cosine_similarity(rating_matrix.T),
                                      index=rating_matrix.columns,
                                      columns=rating_matrix.columns)

    print(f"物品相似度矩阵形状: {item_similarity_df.shape}")

    # 2. Recommendation for a single user.
    def recommend_for_user_item_based(user_id, n_recommendations=5):
        """Score each unrated item by the similarity-weighted mean of the
        user's own ratings and return the best (item_id, score) pairs."""
        if user_id not in rating_matrix.index:
            return []

        user_ratings = rating_matrix.loc[user_id]
        unrated_items = user_ratings[user_ratings == 0].index
        if len(unrated_items) == 0:
            return []

        # The user's rated items do not depend on the candidate item, so
        # compute them once instead of once per candidate.
        rated = user_ratings[user_ratings > 0]

        predictions = {}
        for item_id in unrated_items:
            similarities = item_similarity_df.loc[item_id, rated.index]
            positive = similarities > 0  # only positively similar items
            denominator = similarities[positive].sum()
            if denominator > 0:
                numerator = (similarities[positive] * rated[positive]).sum()
                predictions[item_id] = numerator / denominator

        ranked = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
        return ranked[:n_recommendations]

    # 3. Recommendations for the example user.
    example_user = 0
    recommendations = recommend_for_user_item_based(example_user, n_recommendations=10)

    print(f"\n基于物品的协同过滤为用户{example_user}的推荐结果:")
    for item_id, predicted_rating in recommendations:
        item_name = items_by_id.loc[item_id, 'name']
        category = items_by_id.loc[item_id, 'category']
        print(f"物品{item_id} ({item_name}, {category}): 预测评分 {predicted_rating:.2f}")

    # 4. Heatmap of the similarity between the first 20 items.
    plt.figure(figsize=(10, 8))
    sns.heatmap(item_similarity_df.iloc[:20, :20], cmap='coolwarm', center=0)
    plt.title('物品相似度热力图 (前20个物品)')
    plt.xlabel('物品ID')
    plt.ylabel('物品ID')
    plt.show()

    # 5. Most similar items.
    def find_similar_items(item_id, n_similar=5):
        """Return the *n_similar* most similar items as a Series
        (item id -> similarity); empty Series for an unknown item."""
        if item_id not in item_similarity_df.index:
            # Same type as the success path so callers can always use .items().
            return pd.Series(dtype=float)

        # Remove the item itself explicitly rather than assuming it sorts
        # first (an unrated item has self-similarity 0, ties are possible).
        return (item_similarity_df.loc[item_id]
                .drop(item_id)
                .sort_values(ascending=False)
                .head(n_similar))

    example_item = 0
    similar_items = find_similar_items(example_item, n_similar=5)

    print(f"\n与物品{example_item}最相似的物品:")
    for item_id, similarity in similar_items.items():
        item_name = items_by_id.loc[item_id, 'name']
        category = items_by_id.loc[item_id, 'category']
        print(f"物品{item_id} ({item_name}, {category}): 相似度 {similarity:.3f}")

    return {
        'item_similarity': item_similarity_df,
        'recommendations': recommendations,
        'similar_items': similar_items
    }

# Run the item-based collaborative filtering example.
item_cf_results = item_based_collaborative_filtering()

基于内容的推荐

1. 内容特征提取

def content_based_recommendation():
    """Demonstrate content-based recommendation on ``rec_data``.

    Turns each item description into a TF-IDF vector, builds a
    rating-weighted profile for user 0, scores unrated items against that
    profile, and plots the item vectors after a 2-D PCA projection.

    Returns:
        dict with keys 'item_features' (sparse TF-IDF matrix),
        'item_content_similarity' (DataFrame indexed by item id),
        'recommendations' (list of (item_id, score)) and 'pca_features'
        (n_items x 2 array).
    """
    items_df = rec_data['items_df']
    ratings_df = rec_data['ratings_df']
    items_by_id = items_df.set_index('item_id')

    print("=== 基于内容的推荐 ===")

    # 1. Item features.
    # The descriptions are unsegmented Chinese text. The default word
    # tokenizer only splits on punctuation/whitespace, so a whole clause
    # would become a single token and two items would only look similar when
    # category AND genre match exactly. Character bigrams give partial
    # overlap (same category or same genre) without needing a segmenter.
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 2), max_features=100)
    item_features = tfidf.fit_transform(items_df['description'])

    print(f"物品特征矩阵形状: {item_features.shape}")

    # Row position of every item id inside item_features, so that ids are
    # never used directly as positional indices.
    row_of_item = {item_id: row for row, item_id in enumerate(items_df['item_id'])}

    # 2. Item-item content similarity, labelled by item id.
    item_content_similarity_df = pd.DataFrame(cosine_similarity(item_features),
                                              index=items_df['item_id'],
                                              columns=items_df['item_id'])

    # 3. User profile.
    def build_user_profile(user_id):
        """Return the user's rating-weighted, sum-normalised feature vector,
        or None when the user has no ratings."""
        user_ratings = ratings_df[ratings_df['user_id'] == user_id]
        if len(user_ratings) == 0:
            return None

        rows = [row_of_item[item_id] for item_id in user_ratings['item_id']]
        weights = user_ratings['rating'].to_numpy(dtype=float)

        # Sum of rating * item_vector over the rated items, as one sparse
        # matrix-vector product.
        user_profile = np.asarray(item_features[rows].T @ weights).ravel()

        total = user_profile.sum()
        if total > 0:
            user_profile = user_profile / total

        return user_profile

    # 4. Recommendation.
    def recommend_content_based(user_id, n_recommendations=5):
        """Return the unrated items whose features best match the user's
        profile, as (item_id, score) pairs sorted best first."""
        user_profile = build_user_profile(user_id)
        if user_profile is None:
            return []

        # Set membership instead of scanning a list for every item.
        rated_ids = set(ratings_df.loc[ratings_df['user_id'] == user_id, 'item_id'])

        # One sparse product scores every item at once.
        scores = np.asarray(item_features @ user_profile).ravel()

        candidates = [(item_id, scores[row])
                      for item_id, row in row_of_item.items()
                      if item_id not in rated_ids]
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        return candidates[:n_recommendations]

    # 5. Recommendations for the example user.
    example_user = 0
    recommendations = recommend_content_based(example_user, n_recommendations=10)

    print(f"\n基于内容的推荐为用户{example_user}的推荐结果:")
    for item_id, score in recommendations:
        item_name = items_by_id.loc[item_id, 'name']
        category = items_by_id.loc[item_id, 'category']
        description = items_by_id.loc[item_id, 'description']
        print(f"物品{item_id} ({item_name}, {category}): 偏好分数 {score:.3f}")
        print(f"  描述: {description}")

    # 6. Visualise the item features in 2-D (PCA needs a dense array).
    from sklearn.decomposition import PCA

    pca = PCA(n_components=2)
    item_features_2d = pca.fit_transform(item_features.toarray())

    plt.figure(figsize=(12, 8))

    # One colour per category.
    categories = items_df['category'].unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(categories)))

    for color, category in zip(colors, categories):
        mask = (items_df['category'] == category).to_numpy()
        plt.scatter(item_features_2d[mask, 0], item_features_2d[mask, 1],
                    c=[color], label=category, alpha=0.7, s=50)

    plt.title('物品特征空间可视化 (PCA降维)')
    plt.xlabel('主成分1')
    plt.ylabel('主成分2')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    return {
        'item_features': item_features,
        'item_content_similarity': item_content_similarity_df,
        'recommendations': recommendations,
        'pca_features': item_features_2d
    }

# Run the content-based recommendation example.
content_based_results = content_based_recommendation()

矩阵分解方法

1. 非负矩阵分解 (NMF) 与奇异值分解 (SVD)

def matrix_factorization():
    """Demonstrate NMF- and SVD-based recommendation on ``rec_data``.

    Factorises the users x items rating matrix with scikit-learn's ``NMF``
    and ``TruncatedSVD``, prints the top-5 unrated items for user 0 under
    each model, and shows heatmaps of the factor matrices.

    Returns:
        dict with the NMF and SVD user/item factor matrices and both
        recommendation lists ((item_id, score) pairs).
    """
    rating_matrix = rec_data['rating_matrix']
    # Look items up by their id rather than by DataFrame row label.
    items_by_id = rec_data['items_df'].set_index('item_id')

    print("=== 矩阵分解方法 ===")

    n_components = 10

    # The matrix holds 0 for "not rated" and 1-5 for ratings, so it is
    # already non-negative and can be passed to NMF as is. (Adding 1, as the
    # original did, turns every missing rating into an observed rating of 1
    # and shifts the real ratings to 2-6.)
    # NOTE(review): both models below still fit the zeros as ordinary
    # values; neither ignores missing entries.
    rating_matrix_sparse = csr_matrix(rating_matrix.values)

    # 1. Non-negative matrix factorisation (NMF).
    nmf = NMF(n_components=n_components, random_state=42, max_iter=200)
    user_factors = nmf.fit_transform(rating_matrix_sparse)
    item_factors = nmf.components_

    print(f"用户因子矩阵形状: {user_factors.shape}")
    print(f"物品因子矩阵形状: {item_factors.shape}")
    print(f"NMF重构误差: {nmf.reconstruction_err_:.4f}")

    # 2. Truncated singular value decomposition (SVD).
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    user_factors_svd = svd.fit_transform(rating_matrix_sparse)
    item_factors_svd = svd.components_

    print(f"SVD解释方差比例: {svd.explained_variance_ratio_.sum():.4f}")

    # 3. Recommendation from a pair of factor matrices.
    def recommend_matrix_factorization(user_id, user_factors, item_factors, n_recommendations=5):
        """Return the best-scoring unrated items for *user_id*.

        ``user_factors`` rows and ``item_factors`` columns are expected to
        be in the same order as ``rating_matrix``'s index and columns.
        Returns a list of (item_id, score), best first; empty for an
        unknown user.
        """
        if user_id not in rating_matrix.index:
            return []

        # Translate the user label into its row position explicitly.
        row = rating_matrix.index.get_loc(user_id)
        if row >= user_factors.shape[0]:
            return []

        # Reconstructed scores for every item (one per matrix column).
        predictions = user_factors[row] @ item_factors

        already_rated = rating_matrix.iloc[row].to_numpy() > 0

        # Pair each column position with its item label instead of assuming
        # that labels and positions coincide.
        unrated_predictions = [
            (item_id, predictions[col])
            for col, item_id in enumerate(rating_matrix.columns)
            if not already_rated[col]
        ]

        ranked = sorted(unrated_predictions, key=lambda x: x[1], reverse=True)
        return ranked[:n_recommendations]

    # 4. Compare the two models for the example user.
    example_user = 0

    nmf_recommendations = recommend_matrix_factorization(example_user, user_factors, item_factors, 5)
    svd_recommendations = recommend_matrix_factorization(example_user, user_factors_svd, item_factors_svd, 5)

    print(f"\n矩阵分解推荐结果 (用户{example_user}):")
    print("NMF推荐:")
    for item_id, score in nmf_recommendations:
        item_name = items_by_id.loc[item_id, 'name']
        category = items_by_id.loc[item_id, 'category']
        print(f"  物品{item_id} ({item_name}, {category}): 预测评分 {score:.3f}")

    print("\nSVD推荐:")
    for item_id, score in svd_recommendations:
        item_name = items_by_id.loc[item_id, 'name']
        category = items_by_id.loc[item_id, 'category']
        print(f"  物品{item_id} ({item_name}, {category}): 预测评分 {score:.3f}")

    # 5. Heatmaps of the factor matrices (first 20 users/items, 10 factors).
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    sns.heatmap(user_factors[:20, :10], ax=axes[0, 0], cmap='viridis')
    axes[0, 0].set_title('NMF用户因子矩阵 (前20用户, 前10因子)')
    axes[0, 0].set_xlabel('因子')
    axes[0, 0].set_ylabel('用户')

    sns.heatmap(item_factors[:10, :20], ax=axes[0, 1], cmap='viridis')
    axes[0, 1].set_title('NMF物品因子矩阵 (前10因子, 前20物品)')
    axes[0, 1].set_xlabel('物品')
    axes[0, 1].set_ylabel('因子')

    # SVD factors can be negative, so use a diverging map centred on 0.
    sns.heatmap(user_factors_svd[:20, :10], ax=axes[1, 0], cmap='coolwarm', center=0)
    axes[1, 0].set_title('SVD用户因子矩阵 (前20用户, 前10因子)')
    axes[1, 0].set_xlabel('因子')
    axes[1, 0].set_ylabel('用户')

    sns.heatmap(item_factors_svd[:10, :20], ax=axes[1, 1], cmap='coolwarm', center=0)
    axes[1, 1].set_title('SVD物品因子矩阵 (前10因子, 前20物品)')
    axes[1, 1].set_xlabel('物品')
    axes[1, 1].set_ylabel('因子')

    plt.tight_layout()
    plt.show()

    return {
        'user_factors_nmf': user_factors,
        'item_factors_nmf': item_factors,
        'user_factors_svd': user_factors_svd,
        'item_factors_svd': item_factors_svd,
        'nmf_recommendations': nmf_recommendations,
        'svd_recommendations': svd_recommendations
    }

# Run the matrix factorisation example.
matrix_factorization_results = matrix_factorization()

推荐系统评估

1. 评估指标

def recommendation_evaluation():
    """Evaluate user-based collaborative filtering with a train/test split.

    Splits the ratings 80/20, builds the training matrix, recommends items
    for up to 50 test users and reports precision, recall and a binary NDCG,
    then shows the three metrics as a bar chart.

    Returns:
        dict with keys 'train_data', 'test_data' (DataFrames) and
        'user_cf_results' (dict with 'precision', 'recall', 'ndcg' and
        'coverage', the number of users evaluated).
    """
    ratings_df = rec_data['ratings_df']

    print("=== 推荐系统评估 ===")

    # 1. Train/test split.
    def split_data(ratings_df, test_ratio=0.2):
        """Randomly split the ratings into (train_data, test_data).

        Reseeds NumPy's global generator with 42 so the split is repeatable.
        """
        np.random.seed(42)

        n_rows = len(ratings_df)
        test_positions = np.random.choice(n_rows,
                                          size=int(n_rows * test_ratio),
                                          replace=False)

        # Select by POSITION on both sides; dropping positions as if they
        # were labels only works while the frame has a default index.
        is_test = np.zeros(n_rows, dtype=bool)
        is_test[test_positions] = True

        test_data = ratings_df.iloc[test_positions].copy()
        train_data = ratings_df[~is_test].copy()

        return train_data, test_data

    train_data, test_data = split_data(ratings_df)

    print(f"训练集大小: {len(train_data)}")
    print(f"测试集大小: {len(test_data)}")

    # 2. Training matrix (0 = not rated in the training set).
    train_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating')
    train_matrix = train_matrix.fillna(0)

    # The similarity depends only on the training matrix, so compute it once
    # here instead of on every recommendation call.
    user_similarity_df = pd.DataFrame(cosine_similarity(train_matrix),
                                      index=train_matrix.index,
                                      columns=train_matrix.index)

    # 3. Evaluation.
    def evaluate_recommendations(train_matrix, test_data, recommend_func, n_recommendations=10):
        """Score *recommend_func* against the held-out ratings.

        A recommended item counts as a hit when the user rated it in the
        test set (any rating value). Returns mean precision, recall and
        NDCG plus the number of users evaluated; the means are 0.0 when no
        user could be evaluated.
        """
        precisions = []
        recalls = []
        ndcgs = []

        test_users = test_data['user_id'].unique()

        for user_id in test_users[:50]:  # cap the number of users for speed
            if user_id not in train_matrix.index:
                continue

            user_test_items = test_data[test_data['user_id'] == user_id]
            actual_items = set(user_test_items['item_id'].tolist())
            if not actual_items:
                continue

            # Best effort per user, but never silently: report what failed
            # and do not swallow KeyboardInterrupt/SystemExit.
            try:
                recommendations = recommend_func(user_id, n_recommendations)
            except Exception as exc:
                print(f"跳过用户{user_id}: 推荐失败 ({exc})")
                continue

            recommended_items = {item_id for item_id, _ in recommendations}

            hits = len(actual_items & recommended_items)
            precisions.append(hits / len(recommended_items) if recommended_items else 0)
            recalls.append(hits / len(actual_items))

            # Binary-relevance NDCG: gain 1 for each hit, discounted by rank.
            dcg = sum(1 / np.log2(rank + 2)
                      for rank, (item_id, _) in enumerate(recommendations)
                      if item_id in actual_items)
            ideal_hits = min(len(actual_items), len(recommendations))
            idcg = sum(1 / np.log2(rank + 2) for rank in range(ideal_hits))
            ndcgs.append(dcg / idcg if idcg > 0 else 0)

        def mean_or_zero(values):
            """Mean of *values*, or 0.0 for an empty list (avoids NaN)."""
            return float(np.mean(values)) if values else 0.0

        return {
            'precision': mean_or_zero(precisions),
            'recall': mean_or_zero(recalls),
            'ndcg': mean_or_zero(ndcgs),
            'coverage': len(precisions)
        }

    # 4. Recommender under test.
    def user_cf_recommend(user_id, n_recommendations=10):
        """User-based CF on the training matrix: similarity-weighted average
        of the ratings given by the 10 most similar users."""
        if user_id not in train_matrix.index:
            return []

        user_ratings = train_matrix.loc[user_id]
        unrated_items = user_ratings[user_ratings == 0].index
        if len(unrated_items) == 0:
            return []

        # Drop the user explicitly rather than assuming they sort first.
        similar_users = (user_similarity_df.loc[user_id]
                         .drop(user_id)
                         .sort_values(ascending=False)
                         .head(10))

        predictions = {}
        for item_id in unrated_items:
            numerator = 0
            denominator = 0

            for similar_user_id, similarity in similar_users.items():
                if similarity <= 0:
                    continue
                neighbour_rating = train_matrix.loc[similar_user_id, item_id]
                if neighbour_rating > 0:
                    numerator += similarity * neighbour_rating
                    denominator += similarity

            if denominator > 0:
                predictions[item_id] = numerator / denominator

        ranked = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
        return ranked[:n_recommendations]

    # 5. Run the evaluation.
    print("评估推荐系统性能...")

    user_cf_results = evaluate_recommendations(train_matrix, test_data, user_cf_recommend)

    print(f"\n基于用户的协同过滤评估结果:")
    print(f"精确率: {user_cf_results['precision']:.3f}")
    print(f"召回率: {user_cf_results['recall']:.3f}")
    print(f"NDCG: {user_cf_results['ndcg']:.3f}")
    print(f"覆盖用户数: {user_cf_results['coverage']}")

    # 6. Bar chart of the three metrics.
    metrics = ['精确率', '召回率', 'NDCG']
    values = [user_cf_results['precision'], user_cf_results['recall'], user_cf_results['ndcg']]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(metrics, values, color=['skyblue', 'lightgreen', 'lightcoral'])

    # Print each value just above its bar.
    for bar, value in zip(bars, values):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{value:.3f}', ha='center', va='bottom')

    plt.title('推荐系统评估指标')
    plt.ylabel('指标值')
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    plt.show()

    return {
        'train_data': train_data,
        'test_data': test_data,
        'user_cf_results': user_cf_results
    }

# Run the recommender evaluation example.
evaluation_results = recommendation_evaluation()

今日总结

今天我们学习了推荐系统的基础知识:

1. 推荐系统基础:概念、类型、应用领域

2. 协同过滤算法:基于用户和物品的协同过滤

3. 基于内容的推荐:特征提取、用户偏好建模

4. 矩阵分解方法:NMF、SVD分解

5. 推荐系统评估:精确率、召回率、NDCG等指标

推荐系统是现代互联网应用的核心技术,掌握这些算法可以构建个性化推荐服务。

控制面板
您好,欢迎到访网站!
  查看权限
网站分类
最新留言