Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add origin stack trace capture for DALI operators #5302

Merged
merged 34 commits into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
b6e68a8
Remove the TF implementation files
klecki Feb 15, 2024
fbde70b
Reimplement parts of tf_stack with pure Python. Enable stack remapping
klecki Jan 31, 2024
f1f256c
Introduce draft of the extract_stack reworked for DALI purposes
klecki Jan 31, 2024
4ea7c48
Try to propagate original source_code_line in the source maps
klecki Feb 1, 2024
72db778
Add extraction code
klecki Feb 1, 2024
26b5360
Add ability for using plugin operators for python test
klecki Feb 1, 2024
435f928
Some linting
klecki Feb 1, 2024
84e110a
WIP
klecki Feb 14, 2024
62ac62b
Filtering seems to work
klecki Feb 14, 2024
049404f
Something is close to working
klecki Feb 15, 2024
d8d1118
Clean up the test
klecki Feb 16, 2024
66a239b
TMP work
klecki Feb 16, 2024
0d3ecd5
Maybe we just don't filter _conditionals.py
klecki Feb 16, 2024
f7609f0
it seems to work a bit better
klecki Feb 19, 2024
ad86e14
Almost API handling
klecki Feb 19, 2024
c653db1
It works :)
klecki Feb 20, 2024
1d6f559
Comments
klecki Feb 20, 2024
1d8131d
CLeanup
klecki Feb 20, 2024
65ee7af
Cleanup
klecki Feb 20, 2024
05ae4de
CLeanip
klecki Feb 20, 2024
79ae572
Make sure it works without current pipeline
klecki Feb 20, 2024
ba02824
Adjust test, change how we track the stack trace depth
klecki Feb 23, 2024
4c69791
Fixup
klecki Feb 23, 2024
c9139d7
Test and fixes
klecki Feb 23, 2024
e65686f
Review
klecki Feb 23, 2024
49f55b6
Format
klecki Feb 23, 2024
41ed12b
Test fixes
klecki Feb 26, 2024
73cbbc6
Trim whitespace
klecki Feb 26, 2024
c3357a8
Fixup
klecki Feb 26, 2024
36670b2
Review
klecki Feb 26, 2024
c2dee1f
Fix and improvement
klecki Feb 28, 2024
176f1a7
Review
klecki Feb 28, 2024
76a96e0
Naming and comments
klecki Mar 4, 2024
b909d6f
It should work with trailing "/"
klecki Mar 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions dali/pipeline/operator/error_reporting.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "dali/pipeline/operator/error_reporting.h"
#include "dali/pipeline/operator/op_spec.h"

namespace dali {

std::vector<PythonStackFrame> GetOperatorOriginInfo(const OpSpec &spec) {
auto origin_stack_filename = spec.GetRepeatedArgument<std::string>("_origin_stack_filename");
auto origin_stack_lineno = spec.GetRepeatedArgument<int>("_origin_stack_lineno");
auto origin_stack_name = spec.GetRepeatedArgument<std::string>("_origin_stack_name");
auto origin_stack_line = spec.GetRepeatedArgument<std::string>("_origin_stack_line");

std::vector<PythonStackFrame> origin_stack;
origin_stack.reserve(origin_stack_filename.size());
stiepan marked this conversation as resolved.
Show resolved Hide resolved
const char error[] = "Internal error, mismatch in origin stack trace data.";
DALI_ENFORCE(origin_stack_filename.size() == origin_stack_lineno.size(), error);
DALI_ENFORCE(origin_stack_filename.size() == origin_stack_name.size(), error);
DALI_ENFORCE(origin_stack_filename.size() == origin_stack_line.size(), error);
for (size_t i = 0; i < origin_stack_filename.size(); i++) {
origin_stack.emplace_back(std::move(origin_stack_filename[i]), origin_stack_lineno[i],
std::move(origin_stack_name[i]), std::move(origin_stack_line[i]));
}
return origin_stack;
}


std::string FormatStack(const std::vector<PythonStackFrame> &stack_summary, bool include_context) {
std::stringstream s;
for (auto &frame_summary : stack_summary) {
s << " File \"" << frame_summary.filename << "\", line " << frame_summary.lineno << ", in "
<< frame_summary.name << "\n";
if (include_context) {
s << " " << frame_summary.line << "\n";
}
}
return s.str();
}

} // namespace dali
64 changes: 64 additions & 0 deletions dali/pipeline/operator/error_reporting.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef DALI_PIPELINE_OPERATOR_ERROR_REPORTING_H_
#define DALI_PIPELINE_OPERATOR_ERROR_REPORTING_H_

#include <string>
#include <utility>
#include <vector>

#include "dali/core/api_helper.h"
#include "dali/pipeline/data/types.h"
#include "dali/pipeline/operator/op_spec.h"
namespace dali {

// TODO(klecki): Throw this one into a namespace?

/**
 * @brief Direct equivalent of Python's traceback.FrameSummary:
 * https://docs.python.org/3/library/traceback.html#traceback.FrameSummary
 * Describes a stack frame in the Python stack trace.
 */
struct PythonStackFrame {
  /**
   * @brief Builds a frame description; the string arguments are taken by value and moved into
   * the members.
   *
   * @param filename File name of the source code executed for this frame.
   * @param lineno   Line number of the source code for this frame.
   * @param name     Name of the function being executed in this frame.
   * @param line     Source code of this frame.
   */
  PythonStackFrame(std::string filename, int lineno, std::string name, std::string line)
      : filename(std::move(filename)),
        lineno(lineno),
        name(std::move(name)),
        line(std::move(line)) {}

  /** @brief File name of the source code executed for this frame. */
  std::string filename;
  /** @brief The line number of the source code for this frame. */
  int lineno;
  /** @brief Name of the function being executed in this frame. */
  std::string name;
  /** @brief A string representing the source code for this frame, with leading and trailing
   *         whitespace stripped. */
  std::string line;
};

/**
* @brief Get the origin stack trace for the operator constructed with the given spec.
* The stack trace consists of the frames between the invocation of the pipeline definition and
* the operator call. Each returned PythonStackFrame corresponds to a Python
* traceback.FrameSummary, but the `line` context may be invalid in some autograph-transformed code.
*/
DLL_PUBLIC std::vector<PythonStackFrame> GetOperatorOriginInfo(const OpSpec &spec);

/**
* @brief Format the stack frames as text: one `File "<filename>", line <lineno>, in <name>`
* entry per frame, in the order given.
*
* @param stack_summary frames to format
* @param include_context if true, each entry is followed by a line containing the `line`
* (source code) of that frame
* @return the formatted text, empty for an empty `stack_summary`
*/
DLL_PUBLIC std::string FormatStack(const std::vector<PythonStackFrame> &stack_summary,
bool include_context);

} // namespace dali

#endif // DALI_PIPELINE_OPERATOR_ERROR_REPORTING_H_
33 changes: 32 additions & 1 deletion dali/pipeline/operator/op_schema.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -78,6 +78,37 @@ to accommodate a batch of samples of this size.)code",
AddOptionalArg("preserve", R"code(Prevents the operator from being removed from the
graph even if its outputs are not used.)code",
false);


// For simplicity, we pass the StackSummary as 4 separate arguments, so we don't need to extend
// DALI with support for a special FrameSummary type.
// The list of FrameSummaries can be reconstructed with GetOperatorOriginInfo.
AddOptionalArg("_origin_stack_filename", R"code(Every operator defined in Python captures and
processes the StackSummary (a List[FrameSummary], defined in Python traceback module) that describes
the callstack between the start of pipeline definition tracing and the "call" to the operator
mzient marked this conversation as resolved.
Show resolved Hide resolved
(or full trace if the operator is defined outside the pipeline).
This information is propagated to the backend, so it can be later used to produce meaningful error
messages, pointing to the origin of the error in pipeline definition.

The list of FrameSummaries is split into four parameters: each is the list containing corresponding
parameters of FrameSummary. This parameter represents the `filename` member.)code",
std::vector<std::string>{});

AddOptionalArg("_origin_stack_lineno", R"code(StackSummary - lineno member of FrameSummary, see
_origin_stack_filename for more information.)code",
std::vector<int>{});

AddOptionalArg("_origin_stack_name", R"code(StackSummary - name member of FrameSummary, see
_origin_stack_filename for more information.)code",
std::vector<std::string>{});

AddOptionalArg("_origin_stack_line", R"code(StackSummary - line member of FrameSummary, see
_origin_stack_filename for more information.)code",
std::vector<std::string>{});

AddOptionalArg("_pipeline_internal", R"code(Boolean specifying if this operator was defined within
a pipeline scope. False if it was defined without pipeline being set as current.)code",
true);
}


Expand Down
3 changes: 3 additions & 0 deletions dali/python/nvidia/dali/_autograph/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
from nvidia.dali._autograph.impl.api import do_not_convert
from nvidia.dali._autograph.impl.api import autograph_artifact
from nvidia.dali._autograph.impl.api import is_autograph_artifact
from nvidia.dali._autograph.impl.api import is_frame_ag_call_entrypoint
from nvidia.dali._autograph.impl.api import is_frame_ag_call_unconverted

# from nvidia.dali._autograph.impl.api import StackTraceMapper
from nvidia.dali._autograph.impl.api import to_code
Expand All @@ -52,6 +54,7 @@
from nvidia.dali._autograph.utils import ag_logging
from nvidia.dali._autograph.utils.all_utils import _remove_undocumented
from nvidia.dali._autograph.utils.hooks import OperatorBase
from nvidia.dali._autograph.utils.tf_stack import CustomModuleFilter

# TODO(mdan): Revisit this list once we finalize the generated code mechanism.
_allowed_symbols = [
Expand Down
108 changes: 68 additions & 40 deletions dali/python/nvidia/dali/_autograph/impl/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@
from nvidia.dali._autograph.utils import ag_logging as logging
from nvidia.dali._autograph.utils import all_utils

# TODO(klecki): replace missing functionality
# from nvidia.dali._autograph.utils import tf_stack
from nvidia.dali._autograph.utils import tf_stack
from nvidia.dali._autograph.utils.all_utils import export_symbol


Expand Down Expand Up @@ -130,40 +129,47 @@
e.ag_error_metadata = _ErrorMetadata(cause_tb, metadata, message, source_map, __file__)


class StackTraceMapper(tf_stack.StackTraceMapper):
    """Remaps generated code to code it originated from.

    The map is built from the `ag_source_map` of the converted function and composed with
    the effective map of the parent mapper.
    """

    def __init__(self, converted_fn):
        """Stores the source map of `converted_fn` (read from its `ag_source_map` attribute)."""
        super().__init__()
        self._source_map = converted_fn.ag_source_map
        # get_effective_source_map may be called repeatedly: once on entry, by the superclass,
        # then by each child context manager - so its result is cached here.
        self._cached_map = None

    @staticmethod
    def _origin_as_entry(origin):
        """Flattens an origin record into the tuple stored as the effective source map value:
        (filename, lineno, function_name, source_code_line)."""
        return (
            origin.loc.filename,
            origin.loc.lineno,
            origin.function_name,
            origin.source_code_line,
        )

    def get_effective_source_map(self):
        """Returns a dict mapping (filename, lineno) to
        (filename, lineno, function_name, source_code_line) of the originating code.

        The result is computed once and cached.
        """
        if self._cached_map is not None:
            return self._cached_map

        parent_map = self.parent.get_effective_source_map()

        # Entries of this mapper's own source map.
        effective_source_map = {}
        for loc, origin in self._source_map.items():
            effective_source_map[(loc.filename, loc.lineno)] = self._origin_as_entry(origin)

        # Entries inherited from the parent: if the location the parent points to is itself
        # remapped by this mapper, follow that mapping, otherwise keep the parent's value.
        for key, value in parent_map.items():
            filename, lineno, _, _ = value
            value_loc = origin_info.LineLocation(filename=filename, lineno=lineno)
            if value_loc in self._source_map:
                origin = self._source_map[value_loc]
                effective_source_map[key] = self._origin_as_entry(origin)
            else:
                effective_source_map[key] = value

        self._cached_map = effective_source_map
        return effective_source_map


#
Expand Down Expand Up @@ -283,6 +289,24 @@
return hasattr(entity, "autograph_info__")


# Path suffix identifying the AutoGraph API module in the filename of a stack frame.
_AG_API_FILE_SUFFIX = "nvidia/dali/_autograph/impl/api.py"


def _is_frame_in_ag_api(frame_info, function_name):
    """True if the frame's filename ends with the AutoGraph API module path and the frame's
    function name equals `function_name`.

    `frame_info` is any object with `filename` and `name` attributes.
    """
    return frame_info.filename.endswith(_AG_API_FILE_SUFFIX) and frame_info.name == function_name


def is_frame_ag_call_entrypoint(frame_info):
    """
    True if the given frame is start of a function call wrapped by AutoGraph (ag__.converted_call)
    """
    return _is_frame_in_ag_api(frame_info, "converted_call")


def is_frame_ag_call_unconverted(frame_info):
    """True if the given frame exits autograph to call unconverted user code."""
    return _is_frame_in_ag_api(frame_info, "_call_unconverted")


def converted_call(f, args, kwargs, caller_fn_scope=None, options=None):
"""Converts a function call inline.

Expand Down Expand Up @@ -418,16 +442,20 @@
raise
return _fall_back_unconverted(f, args, kwargs, options, e)

# TODO(klecki): Revert the stack trace mapping functionality
# with StackTraceMapper(converted_f), tf_stack.CurrentModuleFilter():
try:
if kwargs is not None:
result = converted_f(*effective_args, **kwargs)
else:
result = converted_f(*effective_args)
except Exception as e:
_attach_error_metadata(e, converted_f)
raise
# We no longer need CurrentModuleFilter here, as the whole autograph and _conditionals modules
# are filtered by the CustomModuleFilter below.
# TODO(klecki): Filter them just once.
import nvidia.dali._conditionals as dc
import nvidia.dali._autograph as ag

Check notice

Code scanning / CodeQL

Module is imported with 'import' and 'import from' Note

Module 'nvidia.dali._autograph' is imported with both 'import' and 'import from'.
Module '_autograph' is imported with both 'import' and 'import from'.
Fixed Show fixed Hide fixed

with StackTraceMapper(converted_f), tf_stack.CustomModuleFilter([ag, dc]):
try:
if kwargs is not None:
result = converted_f(*effective_args, **kwargs)
else:
result = converted_f(*effective_args)
except Exception as e:
_attach_error_metadata(e, converted_f)
raise

return result

Expand Down
Loading
Loading