Data Science Asked by TinaW on January 4, 2021
LightFM has two methods for generating predictions: predict() and predict_rank(). The evaluation function precision_at_k is based on predict_rank. Since I have many items to rank for each user, the predict method is more suitable (and faster), so I tried to replicate the precision@k score produced by precision_at_k using the predict method.
Clearly, whether one uses predict_rank or predict should not change the precision@k score, but I was unable to replicate the score I get from precision_at_k (based on predict_rank) with the predict method. In fact, the evaluation scores from the predict method are always worse than the scores produced by the precision_at_k function included in the package. Why is that?
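For reference, here is a minimal self-contained sketch (with a tiny made-up interaction matrix) of how the two calls differ in what they return:
import numpy as np
from scipy.sparse import coo_matrix
from lightfm import LightFM

# tiny synthetic interaction matrix: 3 users x 4 items
interactions = coo_matrix(np.array([[1, 0, 1, 0],
                                    [0, 1, 0, 0],
                                    [1, 1, 0, 1]]))
model = LightFM(loss='warp')
model.fit(interactions, epochs=5)

# predict(): dense scores for explicit (user, item) pairs -> 1-D array of floats
scores = model.predict(user_ids=0, item_ids=np.arange(4))

# predict_rank(): sparse matrix with the same structure as the interactions passed in,
# where each stored value is the rank (0 = best) of that item among all items for that user
ranks = model.predict_rank(test_interactions=interactions)
print(scores.shape, ranks.shape)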
Below is an example using open source data.
For simplicity, I'm using only a fraction of the data and a basic model without features, and known positives are not removed (the train_interactions argument is not passed to precision_at_k).
Why this is important: I want to compute NDCG for evaluation, and if I can replicate the precision@k score with predict, I know the post-processing of the predictions is set up correctly and I can simply swap in a different metric.
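To make "swap in a different metric" concrete, a per-user NDCG@k with binary relevance could look roughly like this (a hypothetical helper, not part of LightFM; actual and predictions are the per-user lists built at the end of the post):
import numpy as np

def ndcg_at_k(actual, predictions, k):
    """NDCG@k for one user, with binary relevance (1 if a predicted item is a test positive)."""
    relevance = [1.0 if item in set(actual) else 0.0 for item in predictions[:k]]
    dcg = sum(rel / np.log2(rank + 2) for rank, rel in enumerate(relevance))
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(min(len(actual), k)))
    return dcg / idcg if idcg > 0 else 0.0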
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.cross_validation import random_train_test_split

import numpy as np
import pandas as pd
import os
import zipfile
import csv
import requests
######################################
#
# Fetching the training data
#
######################################
def _download(url: str, dest_path: str):
    req = requests.get(url, stream=True)
    req.raise_for_status()
    with open(dest_path, "wb") as fd:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)

def get_data():
    ratings_url = ("http://www2.informatik.uni-freiburg.de/" "~cziegler/BX/BX-CSV-Dump.zip")
    if not os.path.exists("data"):
        os.makedirs("data")
        _download(ratings_url, "data/data.zip")
    with zipfile.ZipFile("data/data.zip") as archive:
        return (
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Book-Ratings.csv")),
                delimiter=";",
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Books.csv")), delimiter=";"
            ),
            csv.DictReader(
                (x.decode("utf-8", "ignore") for x in archive.open("BX-Users.csv")), delimiter=";"
            ),
        )

def get_ratings():
    return get_data()[0]

def get_book_features():
    return get_data()[1]

def get_user_features():
    return get_data()[2]
# small dataset
udf = pd.DataFrame([x['User-ID'] for x in get_ratings()])
iid = pd.DataFrame([x['ISBN'] for x in get_ratings()])
frames = [udf, iid]
# subsample user list
user_set = set([x['User-ID'] for x in get_ratings()])
user_samples = list(user_set)[:800]
train_df = pd.concat(frames, axis=1)
train_df.columns = ['user_id','item_id']
print(train_df.shape)
train_df = train_df[train_df.user_id.isin(user_samples)]
print(train_df.shape)
book_features = [(x['ISBN'], [x['Book-Author']]) for x in get_book_features() if x['ISBN'] in train_df.item_id.unique().tolist()]
user_features = [(x['User-ID'], [x['Age']]) for x in get_user_features() if x['User-ID'] in train_df.user_id.unique().tolist()]
dataset = Dataset()
dataset.fit(train_df.user_id.tolist(),
            train_df.item_id.tolist())
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
dataset.fit_partial(users=train_df.user_id.tolist(),
                    items=train_df.item_id.tolist(),
                    item_features=[j[0] for i,j in book_features],
                    user_features=[j[0] for i,j in user_features])
#######################
#
# Building the Model
#
######################
dataset = Dataset()
dataset.fit(train_df.user_id.unique().tolist(),
            train_df.item_id.unique().tolist())
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))
dataset.fit_partial(users=train_df.user_id.unique().tolist(),
                    items=train_df.item_id.unique().tolist(),
                    item_features=[j[0] for i,j in book_features],
                    user_features=[j[0] for i,j in user_features])
(interactions, weights) = dataset.build_interactions(((i,j) for i,j in zip(train_df.user_id, train_df.item_id)))
print(repr(interactions))
(train, test) = random_train_test_split(interactions=interactions, test_percentage=0.2)
item_features = dataset.build_item_features((book_features))
print(repr(item_features))
user_features1 = dataset.build_user_features((user_features))
print(repr(user_features1))
mapp = dataset.mapping()
dict_user_id = mapp[0]
dict_item_id = mapp[2]
user_list = list(dict_user_id.keys())
items_list = list(dict_item_id.keys())
items = np.array(items_list)
data = {
'train_cols': items,
"train": train,
'test_cols': items,
"test": test,
"item_features": item_features,
"user_features": user_features1
}
#############################
#
# Training the Model
#
#############################
model = LightFM(loss='warp')
model.fit(data['train'],
          #item_features=data['item_features'],
          #user_features=data['user_features']
          )

### model performance evaluation
pak = precision_at_k(model,
                     test_interactions=data['test'],
                     #train_interactions=data['train'],
                     #item_features=data['item_features'],
                     #user_features=data['user_features']
                     ).mean()
print("precision@10 : {}".format(pak))
This gives precision@10 : 0.004322766792029142.
Under the hood, precision_at_k uses the predict_rank method and computes precision@k like this:
ranks = model.predict_rank(test_interactions=data['test'],
                           #train_interactions=data['train'],
                           #item_features=data['item_features'],
                           #user_features=data['user_features'],
                           num_threads=32,
                           check_intersections=True)
ranks.data = np.less(ranks.data, 10, ranks.data)
precision = np.squeeze(np.array(ranks.sum(axis=1))) / 10
precision = precision[data['test'].getnnz(axis=1) > 0]
print('prec@10: {}'.format(precision.mean()))
Just to demonstrate: this indeed gives precision@10 : 0.004322766792029142.
If I now replicate the precision@k using the predict method I get a different result.
############################################
#
# Replicate precision using the predict method
#
############################################
mapp = dataset.mapping()
dict_user_id = mapp[0]
dict_item_id = mapp[2]
d_user_pred = {}
for user in dict_user_id.keys():
    d_user_pred[user] = []

for uid, i in dict_user_id.items():
    known_positives_ids = data['train_cols'][data['train'].tocsr()[i].indices]
    #print('known positives:{}'.format(known_positives_ids))
    scores = model.predict(user_ids=i,
                           item_ids=np.arange(len(dict_item_id)),
                           #user_features=user_features,
                           #item_features=item_features
                           )
    # get top recommendations
    top_items_ids = data['train_cols'][np.argsort(-scores)]
    # exclude known positives from recommendations
    top_items_ids = np.array(list(set(top_items_ids) - set(known_positives_ids)))
    print('top_items_ids:{}'.format(top_items_ids[:5]))
    d_user_pred[uid] = top_items_ids
##################################
#
# Precision@k evaluation
#
##################################
# get predictions df
df = pd.DataFrame.from_dict(d_user_pred, orient='index').iloc[:,:20]
df['user_id'] = df.index
df = df.melt(id_vars='user_id')
df.columns = ['user_id','rank','item_id']
pred_df = df.groupby('user_id').aggregate(lambda tdf: tdf.tolist()).reset_index()
pred_df.columns = ['user_id','rank','predictions']
# get ground truth df
t = pd.DataFrame(data['test'].todense(), columns=items_list)
t['user_id'] = user_list
t = t.melt(id_vars='user_id')
t = t[t.value==1].drop('value',axis=1)
t.columns = ['user_id','item_id']
actual_df = t.groupby('user_id').aggregate(lambda tdf: tdf.tolist()).reset_index()
actual_df.columns = ['user_id','actual']
# generate eval_df
eval_df = pred_df.merge(actual_df,on='user_id',how='left')
eval_df = eval_df[eval_df.actual.notnull()]
def precision(actual, predictions, k):
    """Fraction of retrieved documents @k that are relevant."""
    return len(set(actual) & set(predictions[:k])) / k

eval_df['prec'] = eval_df.apply(lambda row: precision(actual=row['actual'],
                                                      predictions=row['predictions'],
                                                      k=10), axis=1)
eval_df.prec.mean()
Which gives 0.0005763688760806917.
So, in summary, the predict_rank-based evaluation gives a precision@k of 0.004322766792029142, while the predict-based replication gives 0.0005763688760806917. Why is that?
Using
top_items_ids = [item_id for item_id in top_items_ids if item_id not in known_positives_ids]
instead of
top_items_ids = np.array(list(set(top_items_ids) - set(known_positives_ids)))
resolves the discrepancy. The set difference returns the items in arbitrary order, so the ranking produced by np.argsort is lost and the "top" items are no longer the highest-scored ones; the list comprehension only filters out the known positives and keeps the ranking intact.
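A quick illustration with made-up item IDs:
import numpy as np

top_items_ids = np.array(['item_b', 'item_a', 'item_c'])   # already sorted by score, best first
known_positives_ids = ['item_c']

# set difference: arbitrary order, the ranking is lost
print(np.array(list(set(top_items_ids) - set(known_positives_ids))))
# list comprehension: only filters, the ranking is preserved
print([i for i in top_items_ids if i not in known_positives_ids])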
Correct answer by TinaW on January 4, 2021