Skip to content

Commit

Permalink
Merge pull request #91 from benfred/master
Browse files Browse the repository at this point in the history
add lastfm dataset
  • Loading branch information
erikbern committed Aug 6, 2018
2 parents bb98f12 + e1fa531 commit 83a95fa
Showing 1 changed file with 40 additions and 0 deletions.
40 changes: 40 additions & 0 deletions ann_benchmarks/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,45 @@ def word2bits(out_fn, path, fn):
write_output(X_train, X_test, out_fn, 'euclidean') # TODO: use hamming


def lastfm(out_fn, n_dimensions, test_size=50000):
# This tests out ANN methods for retrieval on simple matrix factorization based
# recommendation algorithms. The idea being that the query/test vectors are user factors
# and the train set are item factors from the matrix factorization model.

# Since the predictor is a dot product, we transform the factors first as described in this
# paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf
# This hopefully replicates the experiments done in this post:
# http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/

# The dataset is from "Last.fm Dataset - 360K users":
# http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html

# this requires the implicit package to generate the factors (on my desktop/gpu this only
# takes 4-5 seconds to train - but could take 1-2 minutes on a laptop)
from implicit.datasets.lastfm import get_lastfm
from implicit.approximate_als import augment_inner_product_matrix
import implicit

# train an als model on the lastfm data
_, _, play_counts = get_lastfm()
model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8))

# transform item factors so that each one has the same norm, and transform the user
# factors such by appending a 0 column
_, item_factors = augment_inner_product_matrix(model.item_factors)
user_factors = numpy.append(model.user_factors,
numpy.zeros((model.user_factors.shape[0], 1)),
axis=1)

# only query the first 50k users (speeds things up signficantly without changing results)
user_factors = user_factors[:test_size]

# after that transformation a cosine lookup will return the same results as the inner product
# on the untransformed data
write_output(item_factors, user_factors, out_fn, 'angular')


DATASETS = {
'fashion-mnist-784-euclidean': fashion_mnist,
'gist-960-euclidean': gist,
Expand All @@ -241,4 +280,5 @@ def word2bits(out_fn, path, fn):
'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
}

0 comments on commit 83a95fa

Please sign in to comment.