From 0c897dc3ce81f38ab36ba92699769a2684502b79 Mon Sep 17 00:00:00 2001 From: Travis Fischer Date: Mon, 26 Aug 2024 16:17:59 -0500 Subject: [PATCH] feat: WIP add evals and telemetry abstractions --- packages/core/package.json | 1 + packages/core/src/telemetry/index.ts | 2 + packages/core/src/telemetry/noop-tracer.ts | 81 +++++++++ packages/core/src/telemetry/telemetry.ts | 188 +++++++++++++++++++++ packages/evals/package.json | 48 ++++++ packages/evals/src/index.ts | 1 + packages/evals/src/types.ts | 104 ++++++++++++ packages/evals/tsconfig.json | 5 + pnpm-lock.yaml | 19 +++ 9 files changed, 449 insertions(+) create mode 100644 packages/core/src/telemetry/index.ts create mode 100644 packages/core/src/telemetry/noop-tracer.ts create mode 100644 packages/core/src/telemetry/telemetry.ts create mode 100644 packages/evals/package.json create mode 100644 packages/evals/src/index.ts create mode 100644 packages/evals/src/types.ts create mode 100644 packages/evals/tsconfig.json diff --git a/packages/core/package.json b/packages/core/package.json index 9a759bffd..c28dd6002 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -32,6 +32,7 @@ "test:unit": "vitest run" }, "dependencies": { + "@opentelemetry/api": "^1.9.0", "@sindresorhus/is": "^7.0.0", "dedent": "^1.5.3", "delay": "^6.0.0", diff --git a/packages/core/src/telemetry/index.ts b/packages/core/src/telemetry/index.ts new file mode 100644 index 000000000..f9beaf492 --- /dev/null +++ b/packages/core/src/telemetry/index.ts @@ -0,0 +1,2 @@ +export * from './noop-tracer' +export * from './telemetry' diff --git a/packages/core/src/telemetry/noop-tracer.ts b/packages/core/src/telemetry/noop-tracer.ts new file mode 100644 index 000000000..eac27ddd1 --- /dev/null +++ b/packages/core/src/telemetry/noop-tracer.ts @@ -0,0 +1,81 @@ +import type { Span, SpanContext, Tracer } from '@opentelemetry/api' + +/** + * Tracer implementation that does nothing. 
+ */ +export const noopTracer: Tracer = { + startSpan(): Span { + return noopSpan + }, + + startActiveSpan<F extends (span: Span) => unknown>( + _name: unknown, + arg1: unknown, + arg2?: unknown, + arg3?: F + ): any { + if (typeof arg1 === 'function') { + return arg1(noopSpan) + } + + if (typeof arg2 === 'function') { + return arg2(noopSpan) + } + + if (typeof arg3 === 'function') { + return arg3(noopSpan) + } + } +} + +const noopSpan: Span = { + spanContext() { + return noopSpanContext + }, + + setAttribute() { + return this + }, + + setAttributes() { + return this + }, + + addEvent() { + return this + }, + + addLink() { + return this + }, + + addLinks() { + return this + }, + + setStatus() { + return this + }, + + updateName() { + return this + }, + + end() { + return this + }, + + isRecording() { + return false + }, + + recordException() { + return this + } +} + +const noopSpanContext: SpanContext = { + traceId: '', + spanId: '', + traceFlags: 0 +} diff --git a/packages/core/src/telemetry/telemetry.ts b/packages/core/src/telemetry/telemetry.ts new file mode 100644 index 000000000..be852a76b --- /dev/null +++ b/packages/core/src/telemetry/telemetry.ts @@ -0,0 +1,188 @@ +import { + type Attributes, + type AttributeValue, + type Span, + type SpanOptions, + SpanStatusCode, + trace, + type Tracer +} from '@opentelemetry/api' + +import type * as types from '../types' +import { noopTracer } from './noop-tracer' + +export type AgenticSpanOptions = { + attributes?: { + [attributeKey: string]: + | AttributeValue + | { input: () => AttributeValue | undefined } + | { output: () => AttributeValue | undefined } + | undefined + } +} + +export class Telemetry { + public readonly isEnabled: boolean + public readonly tracer: Tracer + public readonly recordInputs: boolean + public readonly recordOutputs: boolean + public readonly metadata: Record<string, AttributeValue> + + constructor({ + tracer, + isEnabled = true, + recordInputs = true, + recordOutputs = true, + metadata = {} + }: { + tracer?: Tracer + + /** + * Enable or 
disable telemetry. Enabled by default. + */ + isEnabled?: boolean + + /** + * Enable or disable input recording. Enabled by default. + * + * You might want to disable input recording to avoid recording sensitive + * information, to reduce data transfers, or to increase performance. + */ + recordInputs?: boolean + + /** + * Enable or disable output recording. Enabled by default. + * + * You might want to disable output recording to avoid recording sensitive + * information, to reduce data transfers, or to increase performance. + */ + recordOutputs?: boolean + + /** + * Additional information to include in the telemetry data. + */ + metadata?: Record<string, AttributeValue> + }) { + this.isEnabled = !!isEnabled + this.tracer = + tracer ?? (this.isEnabled ? trace.getTracer('agentic') : noopTracer) + this.recordInputs = recordInputs + this.recordOutputs = recordOutputs + this.metadata = metadata + } + + recordSpan<T>( + { + name, + attributes = {}, + endWhenDone = true, + ...spanOptions + }: { + name: string + endWhenDone?: boolean + } & Omit<SpanOptions, 'attributes'> & + AgenticSpanOptions, + implementation: (span: Span) => types.MaybePromise<T> + ): Promise<T> { + const spanAttributes = this.convertAttributes({ attributes }) + + return this.tracer.startActiveSpan( + name, + { + ...spanOptions, + attributes: spanAttributes + }, + async (span) => { + try { + const result: Awaited<T> = await Promise.resolve(implementation(span)) + + if (endWhenDone) { + span.end() + } + + return result + } catch (err) { + try { + if (err instanceof Error) { + span.recordException({ + name: err.name, + message: err.message, + stack: err.stack + }) + + span.setStatus({ + code: SpanStatusCode.ERROR, + message: err.message + }) + } else { + span.setStatus({ code: SpanStatusCode.ERROR }) + } + } finally { + // Always end the span when there is an error. 
+ span.end() + } + + throw err + } + } + ) + } + + convertAttributes({ attributes = {} }: AgenticSpanOptions): Attributes { + return { + ...Object.fromEntries( + Object.entries(attributes) + .map(([key, value]) => { + if (value === undefined) { + return [key, value] + } + + // input value, check if it should be recorded: + if ( + typeof value === 'object' && + 'input' in value && + typeof value.input === 'function' + ) { + if (!this.recordInputs) { + return undefined + } + + const result = value.input() + if (result === undefined) { + return undefined + } else { + return [key, result] + } + } + + // output value, check if it should be recorded: + if ( + typeof value === 'object' && + 'output' in value && + typeof value.output === 'function' + ) { + if (!this.recordOutputs) { + return undefined + } + + const result = value.output() + if (result === undefined) { + return undefined + } else { + return [key, result] + } + } + + return [key, value] + }) + .filter(Boolean) + ), + + ...Object.fromEntries( + Object.entries(this.metadata).map(([key, value]) => { + return [`agentic.telemetry.metadata.${key}`, value] + }) + ) + } + } +} diff --git a/packages/evals/package.json b/packages/evals/package.json new file mode 100644 index 000000000..700111a14 --- /dev/null +++ b/packages/evals/package.json @@ -0,0 +1,48 @@ +{ + "name": "@agentic/evals", + "version": "0.1.0", + "description": "TODO", + "author": "Travis Fischer ", + "license": "MIT", + "repository": { + "type": "git", + "url": "git+https://github.com/transitive-bullshit/agentic.git" + }, + "type": "module", + "source": "./src/index.ts", + "types": "./dist/index.d.ts", + "sideEffects": false, + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "default": "./dist/index.js" + } + }, + "files": [ + "dist" + ], + "scripts": { + "build": "tsup --config ../../tsup.config.ts", + "dev": "tsup --config ../../tsup.config.ts --watch", + "clean": "del dist", + "test": "run-s test:*", + 
"test:lint": "eslint .", + "test:typecheck": "tsc --noEmit", + "test:unit": "vitest run" + }, + "dependencies": { + "type-fest": "^4.21.0" + }, + "peerDependencies": { + "@agentic/core": "workspace:*", + "zod": "^3.23.8" + }, + "devDependencies": { + "@agentic/core": "workspace:*", + "@agentic/tsconfig": "workspace:*" + }, + "publishConfig": { + "access": "public" + } +} diff --git a/packages/evals/src/index.ts b/packages/evals/src/index.ts new file mode 100644 index 000000000..70b786d12 --- /dev/null +++ b/packages/evals/src/index.ts @@ -0,0 +1 @@ +// TODO diff --git a/packages/evals/src/types.ts b/packages/evals/src/types.ts new file mode 100644 index 000000000..1d51967d3 --- /dev/null +++ b/packages/evals/src/types.ts @@ -0,0 +1,104 @@ +import type { JsonArray, JsonValue } from 'type-fest' + +export type BaseMeta = Record + +// TODO +export type Dataset = {} + +export namespace Judge { + /** The score of the Task's output. */ + export type Score = { + name: string + score: number + meta?: Meta + } + + /** Judge arguments. */ + export type Args = { + input?: Input + output: Output + expected?: Expected + } + + /** Base/generic Judge. */ + export type BaseJudge< + Input, + Output, + Expected, + Meta extends BaseMeta = BaseMeta + > = ( + args: Args + ) => Promise> | Score + + /** Compare strings. */ + export type String = BaseJudge + + /** Compare numbers. */ + export type Number = BaseJudge + + /** Compare Boolean values. */ + export type Boolean = BaseJudge + + /** Compare JSON values. */ + export type JSON = BaseJudge + + /** Compare arrays of JSON values. */ + export type Array = BaseJudge + + /** Compare any values for equality. */ + export type Equality = BaseJudge +} + +export namespace Experiment { + /** The task to be evaluated. */ + export type Task = (input: Input) => Promise + + /** Example data used to run evaluations. 
*/ + export interface Example { + input: Input + output?: Output + expected?: Expected + } + + /** Records are example and metadata used by experiments. */ + export interface Record< + Input, + Output, + Expected, + Meta extends BaseMeta = BaseMeta + > { + id: string + createdAt: Date + updatedAt: Date | null + tags: string[] + metadata: Meta | null + archived: boolean + example: Example + revisionCount: number + } + + /** Perform evaluation on a single example. */ + export type Evaluator = ( + record: Record + ) => Promise<{ + output: Output + scores: Judge.Score[] + sentryTraceId?: string + }> + + /** The result of evaluating a single example. */ + export type EvalResult = { + record: Record + output: Output + scores: Judge.Score[] + } + + /** + * Perform evaluation on a batch of records. + * The records arg is optional because the experiment can be initialized + * with a function to call to get the records. + */ + export type Experiment = ( + records?: Record[] + ) => Promise +} diff --git a/packages/evals/tsconfig.json b/packages/evals/tsconfig.json new file mode 100644 index 000000000..6c8d720c7 --- /dev/null +++ b/packages/evals/tsconfig.json @@ -0,0 +1,5 @@ +{ + "extends": "@agentic/tsconfig/base.json", + "include": ["src"], + "exclude": ["node_modules", "dist"] +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9128894b2..95616129a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -315,6 +315,9 @@ importers: packages/core: dependencies: + '@opentelemetry/api': + specifier: ^1.9.0 + version: 1.9.0 '@sindresorhus/is': specifier: ^7.0.0 version: 7.0.0 @@ -434,6 +437,22 @@ importers: specifier: ^0.0.8 version: 0.0.8(bufferutil@4.0.8)(utf-8-validate@6.0.4) + packages/evals: + dependencies: + type-fest: + specifier: ^4.21.0 + version: 4.25.0 + zod: + specifier: ^3.23.8 + version: 3.23.8 + devDependencies: + '@agentic/core': + specifier: workspace:* + version: link:../core + '@agentic/tsconfig': + specifier: workspace:* + version: link:../tsconfig + 
packages/exa: dependencies: ky: