Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add create LoDTensor from list option and simplify recommender book example #10946

Merged
merged 2 commits into from
May 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions python/paddle/fluid/lod_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ def _convert_lod(lod):


def create_lod_tensor(data, lod, place):
"""Create a lod tensor from a numpy array or an existing lod tensor.
"""Create a lod tensor from a numpy array, a list, or an existing lod tensor.

Create a lod tensor by doing the following:
1. Check that the length-based input lod is valid.
2. Convert the length-based lod to an offset-based LoD.
3. Copy the data from a numpy array or a existing lod tensor to
3. Copy the data from a numpy array, a list or an existing lod tensor to
CPU or GPU device (based on input place).
4. Set the level of detail (LoD) using the offset-based LoD.

Expand All @@ -117,7 +117,7 @@ def create_lod_tensor(data, lod, place):
for more details regarding LoD.

Args:
data: a numpy array or a LoDTensor holding the data to be copied.
data: a numpy array, a LoDTensor, or a list holding the data to be copied.
lod: a list of lists indicating the length-based LoD info specified by the user.
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.

Expand All @@ -126,6 +126,18 @@ def create_lod_tensor(data, lod, place):
"""
if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), lod, place)
elif isinstance(data, list):
# When input data is a list, it only deal with the case where the base element
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: "deal" should be "deals".

# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
# of words or other indexes in the sequence.
new_lod = []
for seq in data:
new_lod.append(len(seq))
assert [new_lod] == lod, "data and lod do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, lod, place)
elif isinstance(data, np.ndarray):
assert _validate_lod(lod,
data.shape[0]), "the provided lod info is invalid"
Expand All @@ -134,9 +146,8 @@ def create_lod_tensor(data, lod, place):
tensor.set_lod(_convert_lod(lod))
return tensor
else:
raise Exception(
"data should be either a LoDTensor or a Numpy array, but you pass type %s instead"
% (type(data)))
raise TypeError(
"data should be either a LoDTensor, a Numpy array or a list")


def create_random_int_lodtensor(lod, base_shape, place, low, high):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,43 +197,30 @@ def event_handler(event):
num_epochs=1,
event_handler=event_handler,
reader=train_reader,
feed_order=[
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id',
'category_id', 'movie_title', 'score'
])
feed_order=feed_order)


def infer(use_cuda, inference_program, save_path):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
inferencer = fluid.Inferencer(
inference_program, param_path=save_path, place=place)

def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)

flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor

# Generate a random input for inference
user_id = create_lod_tensor([[1]])
gender_id = create_lod_tensor([[1]])
age_id = create_lod_tensor([[0]])
job_id = create_lod_tensor([[10]])
movie_id = create_lod_tensor([[783]])
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
# Use the first data from paddle.dataset.movielens.test() as input.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are touching this inference, I raised an issue here, but it is not related to this PR 🤣

# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
place)

results = inferencer.infer(
{
Expand Down
96 changes: 28 additions & 68 deletions python/paddle/fluid/tests/book/test_recommender_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
test_reader = paddle.batch(
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)

feeding = {
'user_id': 0,
'gender_id': 1,
'age_id': 2,
'job_id': 3,
'movie_id': 4,
'category_id': 5,
'movie_title': 6,
'score': 7
}

def func_feed(feeding, data):
feed_tensors = {}
for (key, idx) in feeding.iteritems():
tensor = fluid.LoDTensor()
if key != "category_id" and key != "movie_title":
if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"float32")
else:
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"int64")
else:
numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
data)
lod_info = [len(item) for item in numpy_data]
offset = 0
lod = [offset]
for item in lod_info:
offset += item
lod.append(offset)
numpy_data = np.concatenate(numpy_data, axis=0)
tensor.set_lod([lod])

numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
tensor.set(numpy_data, place)
feed_tensors[key] = tensor
return feed_tensors
feed_order = [
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
'movie_title', 'score'
]

def train_loop(main_program):
exe.run(framework.default_startup_program())

feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)

PASS_NUM = 100
for pass_id in range(PASS_NUM):
for batch_id, data in enumerate(train_reader()):
# train a mini-batch
outs = exe.run(program=main_program,
feed=func_feed(feeding, data),
feed=feeder.feed(data),
fetch_list=[avg_cost])
out = np.array(outs[0])
if (batch_id + 1) % 10 == 0:
avg_cost_set = []
for test_data in test_reader():
avg_cost_np = exe.run(
program=test_program,
feed=func_feed(feeding, test_data),
fetch_list=[avg_cost])
avg_cost_np = exe.run(program=test_program,
feed=feeder.feed(test_data),
fetch_list=[avg_cost])
avg_cost_set.append(avg_cost_np[0])
break # test only 1 segment for speeding up CI

Expand Down Expand Up @@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)

def create_lod_tensor(data, lod=None):
tensor = fluid.LoDTensor()
if lod is None:
# Tensor, the shape is [batch_size, 1]
index = 0
lod_0 = [index]
for l in range(len(data)):
index += 1
lod_0.append(index)
lod = [lod_0]
tensor.set_lod(lod)

flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
tensor.set(flattened_data, place)
return tensor

inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
# Use fluid.io.load_inference_model to obtain the inference program desc,
Expand All @@ -307,26 +260,33 @@ def create_lod_tensor(data, lod=None):

# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
user_id = create_lod_tensor([[1]])
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
# where `data` is a list of sequences of index numbers, `lod` is
# the level of detail (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
# indicating that `data` consists of two sequences of length 3 and 2.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)

assert feed_target_names[1] == "gender_id"
gender_id = create_lod_tensor([[1]])
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)

assert feed_target_names[2] == "age_id"
age_id = create_lod_tensor([[0]])
age_id = fluid.create_lod_tensor([[0]], [[1]], place)

assert feed_target_names[3] == "job_id"
job_id = create_lod_tensor([[10]])
job_id = fluid.create_lod_tensor([[10]], [[1]], place)

assert feed_target_names[4] == "movie_id"
movie_id = create_lod_tensor([[783]])
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)

assert feed_target_names[5] == "category_id"
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)

assert feed_target_names[6] == "movie_title"
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
[[0, 5]])
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
[[5]], place)

# Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets.
Expand Down
11 changes: 7 additions & 4 deletions python/paddle/fluid/tests/test_lod_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,14 @@ def test_convert_lod(self):
self.assertEqual(_convert_lod(lod), converted_lod)

def test_create_lod_tensor(self):
# Only numpy array or a fluid LoDTensor is valid input to
# create_lod_tensor function, currently a list of lists is not.
data = [[1, 2], [3, 4]]
self.assertRaises(Exception, create_lod_tensor, data, [],
# Create LoDTensor from a list
data = [[1, 2, 3], [3, 4]]
wrong_lod = [[2, 2]]
correct_lod = [[3, 2]]
self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
fluid.CPUPlace())
tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
self.assertEqual(tensor.lod(), [[0, 3, 5]])

# Create LoDTensor from numpy array
data = numpy.random.random([10, 1])
Expand Down