diff --git a/Gemfile b/Gemfile
index ffd651e67..48c39c836 100644
--- a/Gemfile
+++ b/Gemfile
@@ -68,6 +68,7 @@ gem "pagy"
 gem "rails-settings-cached"
 gem "tzinfo-data", platforms: %i[windows jruby]
 gem "csv"
+gem "rchardet" # Character encoding detection
 gem "redcarpet"
 gem "stripe"
 gem "plaid"
diff --git a/Gemfile.lock b/Gemfile.lock
index 86435af42..39940bbd9 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -528,6 +528,7 @@ GEM
       ffi (~> 1.0)
     rbs (3.9.4)
       logger
+    rchardet (1.10.0)
     rdoc (6.14.2)
       erb
       psych (>= 4.0.0)
@@ -802,6 +803,7 @@ DEPENDENCIES
   rack-mini-profiler
   rails (~> 7.2.2)
   rails-settings-cached
+  rchardet
   redcarpet
   redis (~> 5.4)
   rotp (~> 6.3)
diff --git a/app/models/import.rb b/app/models/import.rb
index 05b426573..1d288c480 100644
--- a/app/models/import.rb
+++ b/app/models/import.rb
@@ -22,6 +22,7 @@ class Import < ApplicationRecord
   belongs_to :account, optional: true
 
   before_validation :set_default_number_format
+  before_validation :ensure_utf8_encoding
 
   scope :ordered, -> { order(created_at: :desc) }
 
@@ -294,6 +295,83 @@ class Import < ApplicationRecord
     self.number_format ||= "1,234.56" # Default to US/UK format
   end
 
+  # Encodings tried, in order, when charset detection is unavailable or not
+  # confident enough. Windows-1250 comes first to favour Central/Eastern
+  # European exports.
+  COMMON_ENCODINGS = [ "Windows-1250", "Windows-1252", "ISO-8859-1", "ISO-8859-2" ].freeze
+
+  # Minimum rchardet confidence (0.0..1.0) needed to trust a detected encoding.
+  ENCODING_CONFIDENCE_THRESHOLD = 0.75
+
+  # before_validation hook: converts a newly assigned raw_file_str to valid
+  # UTF-8. Detection via rchardet is tried first; COMMON_ENCODINGS is the
+  # fallback when detection is unavailable, unsure, or names an encoding Ruby
+  # cannot convert.
+  def ensure_utf8_encoding
+    # Nothing to normalize for nil/empty content.
+    return if raw_file_str.nil? || raw_file_str.bytesize.zero?
+
+    # Only touch content that is about to be persisted with a new value.
+    return unless will_save_change_to_raw_file_str?
+
+    if raw_file_str.encoding == Encoding::BINARY
+      # Strings read in binary mode are tagged ASCII-8BIT even when the bytes
+      # are already UTF-8, so judge the bytes rather than the tag. Retagging
+      # avoids running valid UTF-8 through lossy detection/fallback decoding.
+      utf8_candidate = raw_file_str.dup.force_encoding(Encoding::UTF_8)
+      if utf8_candidate.valid_encoding?
+        self.raw_file_str = utf8_candidate
+        return
+      end
+    elsif raw_file_str.encoding == Encoding::UTF_8 && raw_file_str.valid_encoding?
+      return
+    end
+
+    begin
+      # Required lazily so a missing gem degrades to the fallback list.
+      require "rchardet"
+      detection = CharDet.detect(raw_file_str)
+      detected_encoding = detection["encoding"]
+      confidence = detection["confidence"].to_f # nil-safe
+
+      if detected_encoding && confidence > ENCODING_CONFIDENCE_THRESHOLD
+        # dup first: force_encoding mutates its receiver, and the receiver
+        # here is the attribute's current value.
+        self.raw_file_str = raw_file_str.dup.force_encoding(detected_encoding).encode("UTF-8", invalid: :replace, undef: :replace)
+      else
+        try_common_encodings
+      end
+    rescue LoadError, ArgumentError, EncodingError
+      # LoadError: rchardet is not installed. ArgumentError: Ruby does not
+      # know the detected encoding name. EncodingError: parent of
+      # CompatibilityError and ConverterNotFoundError (name known, but no
+      # converter to UTF-8 exists).
+      try_common_encodings
+    end
+  end
+
+  # Fallback decoder: reinterprets raw_file_str as the first COMMON_ENCODINGS
+  # entry under which its bytes are valid and transcodes that to UTF-8. When
+  # no entry qualifies, the bytes are tagged UTF-8 and invalid sequences are
+  # replaced with "?".
+  #
+  # NOTE(review): single-byte code pages accept almost any byte, so the first
+  # entry (Windows-1250) presumably wins nearly every time - confirm that this
+  # ordering is intended.
+  def try_common_encodings
+    COMMON_ENCODINGS.each do |encoding|
+      candidate = raw_file_str.dup.force_encoding(encoding)
+      next unless candidate.valid_encoding?
+
+      self.raw_file_str = candidate.encode("UTF-8", invalid: :replace, undef: :replace)
+      return
+    rescue EncodingError
+      next
+    end
+
+    self.raw_file_str = raw_file_str.dup.force_encoding("UTF-8").scrub("?")
+  end
+
   def account_belongs_to_family
     return if account.nil?
     return if account.family_id == family_id
diff --git a/test/fixtures/files/imports/windows1250.csv b/test/fixtures/files/imports/windows1250.csv
new file mode 100644
index 000000000..def36e8e3
--- /dev/null
+++ b/test/fixtures/files/imports/windows1250.csv
@@ -0,0 +1,4 @@
+Date,Name,Amount,Category,Tags,Account,Notes
+2024-01-01,Sklep spożywczy,-50.00,Żywność,Zakupy,Konto główne,Zakupy w sklepie
+2024-01-02,Kawiarnia Café,-15.50,Jedzenie,Kawa|Śniadanie,Karta kredytowa,Poranna kawa
+2024-01-03,Stacja benzynowa,-120.00,Transport,Paliwo|Samochód,Konto główne,Tankowanie auta
diff --git a/test/models/import_encoding_test.rb b/test/models/import_encoding_test.rb
new file mode 100644
index 000000000..0052e8dfc
--- /dev/null
+++ b/test/models/import_encoding_test.rb
@@ -0,0 +1,75 @@
+require "test_helper"
+
+class ImportEncodingTest < ActiveSupport::TestCase
+  setup do
+    @family = families(:dylan_family)
+    @account = accounts(:depository)
+  end
+
+  test "successfully imports Windows-1250 encoded CSV" do
+    # Test that Windows-1250 encoded files are properly converted to UTF-8
+    file_path = Rails.root.join("test/fixtures/files/imports/windows1250.csv")
+    csv_content = File.binread(file_path)
+
+    # Verify the file is not UTF-8
+    assert_equal Encoding::ASCII_8BIT, csv_content.encoding
+    refute csv_content.force_encoding("UTF-8").valid_encoding?, "Test file should not be valid UTF-8"
+
+    import = @family.imports.create!(
+      type: "TransactionImport",
+      account: @account,
+      date_format: "%Y-%m-%d",
+      date_col_label: "Date",
+      amount_col_label: "Amount",
+      name_col_label: "Name",
+      category_col_label: "Category",
+      tags_col_label: "Tags",
+      account_col_label: "Account",
+      notes_col_label: "Notes",
+      signage_convention: "inflows_negative",
+      amount_type_strategy: "signed_amount"
+    )
+
+    # With encoding detection, the import should succeed
+    assert_nothing_raised do
+      import.update!(raw_file_str: csv_content)
+    end
+
+    # Verify the raw_file_str was converted to UTF-8
+    assert_equal Encoding::UTF_8, import.raw_file_str.encoding
+    assert import.raw_file_str.valid_encoding?, "Converted string should be valid UTF-8"
+
+    # Verify we can generate rows from the CSV
+    assert_nothing_raised do
+      import.generate_rows_from_csv
+    end
+
+    # Verify that rows were created
+    import.reload
+    assert import.rows_count > 0, "Expected rows to be created from Windows-1250 CSV"
+    assert_equal 3, import.rows_count, "Expected 3 data rows"
+
+    # Verify Polish characters were preserved correctly
+    first_row = import.rows.first
+    assert_not_nil first_row, "Expected first row to exist"
+    assert_includes first_row.name, "spożywczy", "Polish characters should be preserved"
+  end
+
+  test "handles UTF-8 files without modification" do
+    # Test that valid UTF-8 files are not modified
+    file_path = Rails.root.join("test/fixtures/files/imports/transactions.csv")
+    csv_content = File.read(file_path, encoding: "UTF-8")
+
+    import = @family.imports.create!(
+      type: "TransactionImport",
+      account: @account,
+      date_format: "%Y-%m-%d",
+      raw_file_str: csv_content
+    )
+
+    # UTF-8 content should remain unchanged
+    assert_equal Encoding::UTF_8, import.raw_file_str.encoding
+    assert import.raw_file_str.valid_encoding?
+    assert_equal csv_content, import.raw_file_str, "UTF-8 content should be stored unmodified"
+  end
+end