Small LLMs improvements (#400)

* Initial implementation

* FIX keys

* Add langfuse evals support

* FIX trace upload

* Delete .claude/settings.local.json

Signed-off-by: soky srm <sokysrm@gmail.com>

* Update client.rb

* Small LLMs improvements

* Keep batch size normal

* Update categorizer

* FIX json mode

* Add reasonable alternative to matching

* FIX thinking blocks for LLMs

* Implement json mode support with AUTO mode

* Make auto default for everyone

* FIX linter

* Address review

* Allow exporting manual categories

* FIX user export

* FIX oneshot example pollution

* Update categorization_golden_v1.yml

* Update categorization_golden_v1.yml

* Trim to 100 items

* Update auto_categorizer.rb

* FIX for auto retry in auto mode

* Separate the Eval Logic from the Auto-Categorizer

The expected_null_count parameter conflates eval-specific logic with production categorization logic.

* Force json mode on evals

* Introduce a more mixed dataset

150 items, performance from a local model:

By Difficulty:
  easy: 93.22% accuracy (55/59)
  medium: 93.33% accuracy (42/45)
  hard: 92.86% accuracy (26/28)
  edge_case: 100.0% accuracy (18/18)
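
Overall: 94.0% accuracy (141/150)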

* Improve datasets

Remove data leakage from prompts

* Create eval runs as "pending"

---------

Signed-off-by: soky srm <sokysrm@gmail.com>
Signed-off-by: Juan José Mata <juanjo.mata@gmail.com>
Co-authored-by: Juan José Mata <juanjo.mata@gmail.com>
Authored by soky srm on 2025-12-07 18:11:34 +01:00, committed by GitHub
parent bf90cad9a0
commit 88952e4714
34 changed files with 11027 additions and 42 deletions


@@ -0,0 +1,82 @@
class Eval::Runners::Base
  attr_reader :eval_run

  def initialize(eval_run)
    @eval_run = eval_run
  end

  def run
    eval_run.start!

    begin
      process_samples
      metrics = calculate_metrics
      eval_run.complete!(metrics)
    rescue => e
      eval_run.fail!(e)
      raise
    end

    eval_run
  end

  protected

    def process_samples
      raise NotImplementedError, "Subclasses must implement #process_samples"
    end

    def calculate_metrics
      raise NotImplementedError, "Subclasses must implement #calculate_metrics"
    end

    def samples
      eval_run.dataset.samples
    end

    def provider
      @provider ||= build_provider
    end

    def model
      eval_run.model
    end

  private

    def build_provider
      case eval_run.provider
      when "openai"
        build_openai_provider
      else
        raise "Unsupported provider: #{eval_run.provider}"
      end
    end

    def build_openai_provider
      access_token = eval_run.provider_config["access_token"].presence ||
        ENV["OPENAI_ACCESS_TOKEN"].presence ||
        Setting.openai_access_token

      raise "OpenAI access token not configured" unless access_token.present?

      uri_base = eval_run.provider_config["uri_base"].presence ||
        ENV["OPENAI_URI_BASE"].presence ||
        Setting.openai_uri_base

      Provider::Openai.new(access_token, uri_base: uri_base, model: model)
    end

    def record_result(sample:, actual_output:, correct:, **attributes)
      eval_run.results.create!(
        sample: sample,
        actual_output: actual_output,
        correct: correct,
        **attributes
      )
    end

    def log_progress(message)
      Rails.logger.info("[Eval::Runner] #{message}")
    end
end
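
A minimal sketch (not part of the diff) of how a subclass plugs into this template: the base class drives start!/complete!/fail! on the eval run, while the subclass supplies process_samples and calculate_metrics. The EchoRunner name and its trivial metric are hypothetical.

class Eval::Runners::EchoRunner < Eval::Runners::Base
  protected

    # Hypothetical: record every sample as incorrect with an empty output
    def process_samples
      samples.each do |sample|
        record_result(sample: sample, actual_output: {}, correct: false)
      end
    end

    def calculate_metrics
      { "total" => eval_run.results.count }
    end
end

Eval::Runners::EchoRunner.new(eval_run).run # => eval_run, marked complete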


@@ -0,0 +1,199 @@
class Eval::Runners::CategorizationRunner < Eval::Runners::Base
  DEFAULT_BATCH_SIZE = 25 # Matches Provider::Openai limit

  protected

    def process_samples
      all_samples = samples.to_a
      batch_size = effective_batch_size

      log_progress("Processing #{all_samples.size} samples in batches of #{batch_size}")

      all_samples.each_slice(batch_size).with_index do |batch, batch_idx|
        log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / batch_size).ceil}")
        process_batch(batch)
      end
    end

    # Use smaller batches for custom providers (local LLMs) to reduce context length
    def effective_batch_size
      eval_run.provider_config["batch_size"]&.to_i || DEFAULT_BATCH_SIZE
    end

    # Get JSON mode from provider config (optional override)
    # Valid values: "strict", "json_object", "none"
    def json_mode
      eval_run.provider_config["json_mode"]
    end

    def calculate_metrics
      Eval::Metrics::CategorizationMetrics.new(eval_run).calculate
    end

  private

    def process_batch(batch_samples)
      return if batch_samples.empty?

      # Build inputs for the provider
      transactions = batch_samples.map do |sample|
        sample.to_transaction_input.merge(id: sample.id)
      end

      # Get categories from first sample's context (should be shared)
      # Symbolize keys since Provider::Openai::AutoCategorizer expects symbol keys
      categories = batch_samples.first.categories_context.map(&:deep_symbolize_keys)

      # Determine effective JSON mode for this batch.
      # If the batch has many expected nulls and we're using auto mode, force strict mode
      # to prevent the auto-categorizer from incorrectly retrying (it would see many nulls
      # and think strict mode is broken, when actually the nulls are expected).
      effective_json_mode = json_mode_for_batch(batch_samples)

      start_time = Time.current

      begin
        response = provider.auto_categorize(
          transactions: transactions,
          user_categories: categories,
          model: model,
          json_mode: effective_json_mode
        )

        latency_ms = ((Time.current - start_time) * 1000).to_i
        per_sample_latency = latency_ms / batch_samples.size

        if response.success?
          record_batch_results(batch_samples, response.data, per_sample_latency)
        else
          record_batch_errors(batch_samples, response.error, per_sample_latency)
        end
      rescue => e
        latency_ms = ((Time.current - start_time) * 1000).to_i
        per_sample_latency = latency_ms / batch_samples.size
        record_batch_errors(batch_samples, e, per_sample_latency)
      end
    end

    def record_batch_results(batch_samples, categorizations, per_sample_latency)
      batch_samples.each do |sample|
        # Find the categorization result for this sample
        categorization = categorizations.find { |c| c.transaction_id.to_s == sample.id.to_s }

        actual_category = categorization&.category_name
        # Normalize "null" string to nil
        actual_category = nil if actual_category == "null"

        expected_category = sample.expected_category_name
        acceptable_categories = sample.all_acceptable_categories

        # Evaluate correctness - check primary expected and alternatives
        correct = evaluate_correctness_with_alternatives(actual_category, expected_category, acceptable_categories)
        exact_match = actual_category == expected_category
        alternative_match = acceptable_categories.include?(actual_category) && !exact_match
        hierarchical = evaluate_hierarchical_match(actual_category, expected_category, sample)

        record_result(
          sample: sample,
          actual_output: { "category_name" => actual_category },
          correct: correct,
          exact_match: exact_match,
          alternative_match: alternative_match,
          hierarchical_match: hierarchical,
          null_expected: expected_category.nil?,
          null_returned: actual_category.nil?,
          latency_ms: per_sample_latency
        )
      end
    end

    def record_batch_errors(batch_samples, error, per_sample_latency)
      error_message = error.is_a?(Exception) ? error.message : error.to_s

      batch_samples.each do |sample|
        record_result(
          sample: sample,
          actual_output: { "error" => error_message },
          correct: false,
          exact_match: false,
          hierarchical_match: false,
          null_expected: sample.expected_category_name.nil?,
          null_returned: true,
          latency_ms: per_sample_latency,
          metadata: { "error" => error_message }
        )
      end
    end

    # Determine the effective JSON mode for a batch based on expected null ratio.
    # This prevents the auto-categorizer from incorrectly retrying when many nulls are expected.
    def json_mode_for_batch(batch_samples)
      # If a specific mode is configured (not "auto"), always use it
      return json_mode if json_mode.present? && json_mode != "auto"

      # Calculate expected null ratio for this batch
      expected_null_count = batch_samples.count { |s| s.expected_category_name.nil? }
      expected_null_ratio = expected_null_count.to_f / batch_samples.size

      # If >50% of the batch is expected to return null, force strict mode.
      # This matches the AUTO_MODE_NULL_THRESHOLD in the auto-categorizer
      # and prevents unnecessary retries when nulls are legitimate.
      if expected_null_ratio > 0.5
        log_progress("Batch has #{(expected_null_ratio * 100).round}% expected nulls, forcing strict mode to prevent false retry")
        "strict"
      else
        # Use auto mode - let the auto-categorizer decide
        "auto"
      end
    end

    def evaluate_correctness(actual, expected)
      # Both null = correct
      return true if actual.nil? && expected.nil?
      # Expected null but got value = incorrect
      return false if expected.nil? && actual.present?
      # Expected value but got null = incorrect
      return false if actual.nil? && expected.present?

      # Compare values
      actual == expected
    end

    def evaluate_correctness_with_alternatives(actual, expected, acceptable_categories)
      # Both null = correct
      return true if actual.nil? && expected.nil?
      # Expected null but got value = incorrect
      return false if expected.nil? && actual.present?
      # Expected value but got null = incorrect
      return false if actual.nil? && expected.present?

      # Check if actual matches any acceptable category (primary or alternatives)
      acceptable_categories.include?(actual)
    end

    def evaluate_hierarchical_match(actual, expected, sample)
      return false if actual.nil? || expected.nil?
      return true if actual == expected

      # Check if actual matches parent of expected category
      categories = sample.categories_context

      # Find the expected category
      expected_cat = categories.find { |c| c["name"] == expected }
      return false unless expected_cat

      # If expected has a parent, check if actual matches the parent
      if expected_cat["parent_id"]
        parent = categories.find { |c| c["id"].to_s == expected_cat["parent_id"].to_s }
        return parent && parent["name"] == actual
      end

      # Also check if actual is a subcategory of expected (reverse direction)
      actual_cat = categories.find { |c| c["name"] == actual }
      return false unless actual_cat

      if actual_cat["parent_id"]
        parent = categories.find { |c| c["id"].to_s == actual_cat["parent_id"].to_s }
        return parent && parent["name"] == expected
      end

      false
    end
end
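
To make the null-ratio heuristic in json_mode_for_batch concrete, an illustrative sketch (category values hypothetical): with 6 of 10 samples expecting nil, the ratio 0.6 exceeds the 0.5 threshold and strict mode is forced; at 3 of 10 the batch would stay in auto mode.

expected = Array.new(6) { nil } + [ "Groceries", "Rent", "Travel", "Dining" ]
ratio = expected.count(&:nil?).to_f / expected.size # => 0.6
ratio > 0.5 ? "strict" : "auto"                     # => "strict"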


@@ -0,0 +1,255 @@
class Eval::Runners::ChatRunner < Eval::Runners::Base
  # Chat samples are processed one at a time (not batched)
  # because each has unique context and function calling requirements

  protected

    def process_samples
      all_samples = samples.to_a

      log_progress("Processing #{all_samples.size} chat samples")

      all_samples.each_with_index do |sample, idx|
        log_progress("Processing sample #{idx + 1}/#{all_samples.size}")
        process_sample(sample)
      end
    end

    def calculate_metrics
      Eval::Metrics::ChatMetrics.new(eval_run).calculate
    end

  private

    def process_sample(sample)
      prompt = sample.chat_prompt
      start_time = Time.current

      begin
        response = provider.chat_response(
          prompt,
          model: model,
          instructions: build_instructions,
          functions: build_function_definitions
        )

        latency_ms = ((Time.current - start_time) * 1000).to_i

        if response.success?
          record_chat_result(sample, response.data, latency_ms)
        else
          record_error_result(sample, response.error, latency_ms)
        end
      rescue => e
        latency_ms = ((Time.current - start_time) * 1000).to_i
        record_error_result(sample, e, latency_ms)
      end
    end

    def record_chat_result(sample, chat_response, latency_ms)
      # Extract function calls from response
      actual_functions = extract_functions_from_response(chat_response)

      # Extract response text
      response_text = extract_response_text(chat_response)

      # Evaluate function calling accuracy
      expected_functions = sample.expected_functions
      function_match = evaluate_function_match(actual_functions, expected_functions)

      # Evaluate response content
      expected_keywords = sample.expected_response_contains
      response_match = evaluate_response_contains(response_text, expected_keywords)

      # Overall correctness: functions are correct AND response contains expected keywords
      correct = function_match[:correct] && response_match

      record_result(
        sample: sample,
        actual_output: {
          "functions" => actual_functions,
          "response_text" => response_text,
          "function_match_details" => function_match
        },
        correct: correct,
        exact_match: function_match[:exact_match],
        latency_ms: latency_ms,
        metadata: {
          "function_selection_correct" => function_match[:selection_correct],
          "parameter_accuracy" => function_match[:parameter_accuracy],
          "response_keywords_found" => response_match,
          "expected_functions" => expected_functions,
          "expected_keywords" => expected_keywords
        }
      )
    end

    def record_error_result(sample, error, latency_ms)
      error_message = error.is_a?(Exception) ? error.message : error.to_s

      record_result(
        sample: sample,
        actual_output: { "error" => error_message },
        correct: false,
        exact_match: false,
        latency_ms: latency_ms,
        metadata: { "error" => error_message }
      )
    end

    def extract_functions_from_response(chat_response)
      # ChatResponse has function_requests array
      function_requests = chat_response.function_requests || []

      function_requests.map do |req|
        {
          "name" => req.function_name,
          "params" => parse_function_args(req.function_args)
        }
      end
    end

    def parse_function_args(args)
      return {} if args.nil?
      return args if args.is_a?(Hash)

      JSON.parse(args)
    rescue JSON::ParserError
      {}
    end

    def extract_response_text(chat_response)
      # ChatResponse has messages array with output_text
      messages = chat_response.messages || []
      messages.map(&:output_text).compact.join("\n")
    end

    def evaluate_function_match(actual_functions, expected_functions)
      return { correct: true, exact_match: true, selection_correct: true, parameter_accuracy: 1.0 } if expected_functions.empty? && actual_functions.empty?
      return { correct: false, exact_match: false, selection_correct: false, parameter_accuracy: 0.0 } if expected_functions.empty? && actual_functions.any?

      # Check function selection accuracy
      expected_names = expected_functions.map { |f| normalize_function_name(f["name"]) }.compact
      actual_names = actual_functions.map { |f| normalize_function_name(f["name"]) }.compact

      selection_correct = expected_names.all? { |name| actual_names.include?(name) }

      # Check parameter accuracy for matched functions
      param_scores = []

      expected_functions.each do |expected_func|
        expected_name = normalize_function_name(expected_func["name"])
        actual_func = actual_functions.find { |f| normalize_function_name(f["name"]) == expected_name }

        if actual_func
          param_score = evaluate_parameters(actual_func["params"], expected_func["params"] || {})
          param_scores << param_score
        else
          param_scores << 0.0
        end
      end

      parameter_accuracy = param_scores.empty? ? 0.0 : (param_scores.sum / param_scores.size).round(4)

      # Exact match requires same functions with same parameters
      exact_match = selection_correct && parameter_accuracy == 1.0

      # Correct if all expected functions were called (parameters don't have to be exact)
      correct = selection_correct

      {
        correct: correct,
        exact_match: exact_match,
        selection_correct: selection_correct,
        parameter_accuracy: parameter_accuracy
      }
    end

    def normalize_function_name(name)
      return nil if name.nil?

      # Convert to snake_case and downcase
      name.to_s.underscore.downcase
    end

    def evaluate_parameters(actual_params, expected_params)
      return 1.0 if expected_params.empty?
      return 0.0 if actual_params.nil?

      actual_params = actual_params.stringify_keys
      expected_params = expected_params.stringify_keys

      matches = 0
      total = expected_params.size

      expected_params.each do |key, expected_value|
        actual_value = actual_params[key]

        if values_match?(actual_value, expected_value)
          matches += 1
        end
      end

      (matches.to_f / total).round(4)
    end

    def values_match?(actual, expected)
      return true if actual == expected
      return true if actual.to_s.downcase == expected.to_s.downcase

      # For arrays, check if all expected values are present
      if expected.is_a?(Array) && actual.is_a?(Array)
        expected_normalized = expected.map { |v| v.to_s.downcase }
        actual_normalized = actual.map { |v| v.to_s.downcase }
        return expected_normalized.all? { |v| actual_normalized.include?(v) }
      end

      # For dates, try to parse and compare
      if expected.to_s =~ /^\d{4}-\d{2}-\d{2}$/
        begin
          expected_date = Date.parse(expected.to_s)
          actual_date = Date.parse(actual.to_s)
          return expected_date == actual_date
        rescue
          # Not valid dates, fall through
        end
      end

      false
    end

    def evaluate_response_contains(response_text, expected_keywords)
      return true if expected_keywords.empty?
      return false if response_text.nil? || response_text.empty?

      normalized_response = response_text.downcase

      expected_keywords.all? do |keyword|
        normalized_response.include?(keyword.to_s.downcase)
      end
    end

    def build_instructions
      # Simple instructions for evaluation - we don't have a real user/family context
      <<~PROMPT
        You are a financial assistant helping users understand their financial data.
        Use the functions available to answer questions about accounts, transactions, and financial statements.
        Today's date is #{Date.current}.
      PROMPT
    end

    def build_function_definitions
      # Return the function definitions that the chat would normally have
      [
        build_function_definition("get_transactions", "Get paginated transactions with optional filters"),
        build_function_definition("get_accounts", "Get all accounts with balances and historical data"),
        build_function_definition("get_balance_sheet", "Get current net worth, assets, and liabilities"),
        build_function_definition("get_income_statement", "Get income and expenses by category for a period")
      ]
    end

    def build_function_definition(name, description)
      {
        name: name,
        description: description,
        params_schema: { type: "object", properties: {}, additionalProperties: true },
        strict: false
      }
    end
end
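
A worked example of the parameter scoring above (values hypothetical): with two expected parameters and one case-insensitive match, evaluate_parameters returns 0.5, so a call can still count as correct (selection only) while exact_match stays false because parameter_accuracy < 1.0.

expected = { "account" => "Checking", "start_date" => "2025-11-01" }
actual   = { "account" => "checking", "start_date" => "2025-10-01" }
# "checking" matches case-insensitively; the dates parse but differ
# evaluate_parameters(actual, expected) # => 0.5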


@@ -0,0 +1,199 @@
class Eval::Runners::MerchantDetectionRunner < Eval::Runners::Base
  BATCH_SIZE = 25 # Matches Provider::Openai limit
  FUZZY_MATCH_THRESHOLD = 0.8

  protected

    def process_samples
      all_samples = samples.to_a

      log_progress("Processing #{all_samples.size} samples in batches of #{BATCH_SIZE}")

      all_samples.each_slice(BATCH_SIZE).with_index do |batch, batch_idx|
        log_progress("Processing batch #{batch_idx + 1}/#{(all_samples.size.to_f / BATCH_SIZE).ceil}")
        process_batch(batch)
      end
    end

    def calculate_metrics
      Eval::Metrics::MerchantDetectionMetrics.new(eval_run).calculate
    end

  private

    def process_batch(batch_samples)
      # Build inputs for the provider
      transactions = batch_samples.map do |sample|
        sample.to_transaction_input.merge(id: sample.id)
      end

      # Get merchants from first sample's context (should be shared)
      # Symbolize keys since Provider::Openai::AutoMerchantDetector expects symbol keys
      merchants = batch_samples.first.merchants_context.map(&:deep_symbolize_keys)

      start_time = Time.current

      begin
        response = provider.auto_detect_merchants(
          transactions: transactions,
          user_merchants: merchants,
          model: model
        )

        latency_ms = ((Time.current - start_time) * 1000).to_i
        per_sample_latency = latency_ms / batch_samples.size

        if response.success?
          record_batch_results(batch_samples, response.data, per_sample_latency)
        else
          record_batch_errors(batch_samples, response.error, per_sample_latency)
        end
      rescue => e
        latency_ms = ((Time.current - start_time) * 1000).to_i
        per_sample_latency = latency_ms / batch_samples.size
        record_batch_errors(batch_samples, e, per_sample_latency)
      end
    end

    def record_batch_results(batch_samples, merchants_detected, per_sample_latency)
      batch_samples.each do |sample|
        # Find the merchant detection result for this sample
        detection = merchants_detected.find { |m| m.transaction_id.to_s == sample.id.to_s }

        actual_name = normalize_null(detection&.business_name)
        actual_url = normalize_null(detection&.business_url)

        expected_name = sample.expected_business_name
        expected_url = sample.expected_business_url

        # Evaluate correctness
        name_match = evaluate_name_match(actual_name, expected_name)
        url_match = evaluate_url_match(actual_url, expected_url)
        fuzzy_score = calculate_fuzzy_score(actual_name, expected_name)

        # Overall correct if both name and URL match expectations
        correct = name_match && url_match

        # Exact match requires both to be exactly equal
        exact_match = actual_name == expected_name && normalize_url(actual_url) == normalize_url(expected_url)

        record_result(
          sample: sample,
          actual_output: { "business_name" => actual_name, "business_url" => actual_url },
          correct: correct,
          exact_match: exact_match,
          fuzzy_score: fuzzy_score,
          null_expected: expected_name.nil? && expected_url.nil?,
          null_returned: actual_name.nil? && actual_url.nil?,
          latency_ms: per_sample_latency
        )
      end
    end

    def record_batch_errors(batch_samples, error, per_sample_latency)
      error_message = error.is_a?(Exception) ? error.message : error.to_s

      batch_samples.each do |sample|
        record_result(
          sample: sample,
          actual_output: { "error" => error_message },
          correct: false,
          exact_match: false,
          fuzzy_score: 0.0,
          null_expected: sample.expected_business_name.nil?,
          null_returned: true,
          latency_ms: per_sample_latency,
          metadata: { "error" => error_message }
        )
      end
    end

    def normalize_null(value)
      return nil if value.nil?
      return nil if value == "null"
      return nil if value.to_s.strip.empty?

      value
    end

    def evaluate_name_match(actual, expected)
      # Both null = correct
      return true if actual.nil? && expected.nil?
      # Expected null but got value = false positive
      return false if expected.nil? && actual.present?
      # Expected value but got null = false negative
      return false if actual.nil? && expected.present?

      # Use fuzzy matching for name comparison
      fuzzy_match?(actual, expected)
    end

    def evaluate_url_match(actual, expected)
      # Both null = correct
      return true if actual.nil? && expected.nil?
      # Expected null but got value = false positive
      return false if expected.nil? && actual.present?
      # Expected value but got null = false negative
      return false if actual.nil? && expected.present?

      # Normalize and compare URLs
      normalize_url(actual) == normalize_url(expected)
    end

    def normalize_url(url)
      return nil if url.nil?

      url.to_s.downcase
        .gsub(/^(https?:\/\/)?(www\.)?/, "")
        .chomp("/")
        .strip
    end

    def fuzzy_match?(actual, expected)
      return false if actual.nil? || expected.nil?

      calculate_fuzzy_score(actual, expected) >= FUZZY_MATCH_THRESHOLD
    end

    def calculate_fuzzy_score(actual, expected)
      return 1.0 if actual == expected
      return 0.0 if actual.nil? || expected.nil?

      # Simple Levenshtein distance-based similarity.
      # Normalize strings for comparison
      a = actual.to_s.downcase.strip
      b = expected.to_s.downcase.strip

      return 1.0 if a == b

      # Calculate Levenshtein distance
      distance = levenshtein_distance(a, b)
      max_length = [ a.length, b.length ].max

      return 0.0 if max_length == 0

      # Convert distance to similarity score (0.0 to 1.0)
      (1.0 - (distance.to_f / max_length)).round(4)
    end

    def levenshtein_distance(s1, s2)
      m = s1.length
      n = s2.length

      return m if n == 0
      return n if m == 0

      # Create distance matrix
      d = Array.new(m + 1) { Array.new(n + 1) }

      (0..m).each { |i| d[i][0] = i }
      (0..n).each { |j| d[0][j] = j }

      (1..n).each do |j|
        (1..m).each do |i|
          cost = s1[i - 1] == s2[j - 1] ? 0 : 1
          d[i][j] = [
            d[i - 1][j] + 1,        # deletion
            d[i][j - 1] + 1,        # insertion
            d[i - 1][j - 1] + cost  # substitution
          ].min
        end
      end

      d[m][n]
    end
end
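
Two worked examples of the fuzzy scoring above (merchant strings are hypothetical): "Starbuck's" is one edit away from "Starbucks" over ten normalized characters, clearing the 0.8 threshold, while "Amazon.com" is four edits away from "Amazon" and falls short.

calculate_fuzzy_score("Starbucks", "Starbuck's") # => 0.9 (1 edit / 10 chars) -> fuzzy match
calculate_fuzzy_score("Amazon.com", "Amazon")    # => 0.6 (4 edits / 10 chars) -> below threshold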