Text Preprocessing

Effective text preprocessing is crucial for accurate word association analysis. This guide covers all preprocessing options and best practices.

The TextNorm Configuration

TextAssociations.jl uses the TextNorm struct to control all preprocessing:

using TextAssociations

# Default configuration
default_config = TextNorm()

# Custom configuration
custom_config = TextNorm(
    strip_case=true,           # Lowercase text
    strip_accents=false,       # Keep diacritics
    unicode_form=:NFC,         # Unicode normalization
    strip_punctuation=true,    # Remove punctuation
    punctuation_to_space=true, # Replace punct with space
    normalize_whitespace=true, # Collapse multiple spaces
    strip_whitespace=false,    # Don't remove all spaces
    use_prepare=false         # Don't use TextAnalysis pipeline
)

println("Default settings:")
for field in fieldnames(TextNorm)
    println("  $field: $(getfield(default_config, field))")
end
Default settings:
  strip_case: true
  strip_accents: false
  unicode_form: NFC
  strip_punctuation: true
  punctuation_to_space: true
  normalize_whitespace: true
  strip_whitespace: false
  use_prepare: false

Preprocessing Options

Case Normalization

using TextAssociations
using DataFrames

s = "The IBM CEO visited NASA headquarters."

# Keep original case
ct_case = ContingencyTable(s, "IBM";
    windowsize=5,
    minfreq=1,
    norm_config=TextNorm(strip_case=false))
results_case = assoc_score(PMI, ct_case)

# Normalize to lowercase
ct_lower = ContingencyTable(s, "IBM";
    windowsize=5,
    minfreq=1,
    norm_config=TextNorm(strip_case=true))
results_lower = assoc_score(PMI, ct_lower)

println("With case preservation: $(nrow(results_case)) collocates")
println("With lowercasing: $(nrow(results_lower)) collocates")
With case preservation: 5 collocates
With lowercasing: 5 collocates

In this tiny example the counts happen to coincide; on larger corpora, lowercasing merges case variants (for example "The" and "the") into a single type, changing both the collocate inventory and the scores.

Punctuation Handling

using TextAssociations
using TextAnalysis: text

text_punct = "Well-designed, user-friendly interface; however, performance issues..."

# Remove punctuation
config_remove = TextNorm(strip_punctuation=true, punctuation_to_space=false)
removed = prep_string(text_punct, config_remove)
println("Removed: '$(text(removed))'")

# Replace with spaces
config_space = TextNorm(strip_punctuation=true, punctuation_to_space=true)
spaced = prep_string(text_punct, config_space)
println("To space: '$(text(spaced))'")

# Keep punctuation
config_keep = TextNorm(strip_punctuation=false)
kept = prep_string(text_punct, config_keep)
println("Kept: '$(text(kept))'")
Removed: 'welldesigned userfriendly interface however performance issues'
To space: 'well designed user friendly interface however performance issues '
Kept: 'well-designed, user-friendly interface; however, performance issues...'

Whitespace Normalization

using TextAssociations
using TextAnalysis: text

text1 = "Multiple   spaces    and\t\ttabs\n\neverywhere"

# Normalize whitespace
normalized = prep_string(text1, TextNorm(normalize_whitespace=true))
println("Normalized: '$(text(normalized))'")

# Strip all whitespace (for certain languages)
stripped = prep_string(text1, TextNorm(strip_whitespace=true))
println("Stripped: '$(text(stripped))'")
Normalized: 'multiple spaces and tabs everywhere'
Stripped: 'multiple spaces and tabs everywhere'

Accent Stripping

Accent handling is critical for multilingual analysis:

using TextAssociations
using TextAnalysis: text

# Greek text with tonos marks
greek = "Η ανάλυση κειμένου είναι σημαντική"

# French text with accents
french = "L'analyse détaillée révèle des résultats intéressants"

# Spanish text
spanish = "El análisis lingüístico computacional avanzó rápidamente"

function compare_accent_handling(s::String, lang::String)
    println("\n$lang text:")

    # With accents
    with_config = TextNorm(strip_accents=false)
    with_doc = prep_string(s, with_config)
    println("  With accents: '$(text(with_doc))'")

    # Without accents
    without_config = TextNorm(strip_accents=true)
    without_doc = prep_string(s, without_config)
    println("  Without accents: '$(text(without_doc))'")
end

compare_accent_handling(greek, "Greek")
compare_accent_handling(french, "French")
compare_accent_handling(spanish, "Spanish")

Greek text:
  With accents: 'η ανάλυση κειμένου είναι σημαντική'
  Without accents: 'η αναλυση κειμενου ειναι σημαντικη'

French text:
  With accents: 'l analyse détaillée révèle des résultats intéressants'
  Without accents: 'l analyse detaillee revele des resultats interessants'

Spanish text:
  With accents: 'el análisis lingüístico computacional avanzó rápidamente'
  Without accents: 'el analisis linguistico computacional avanzo rapidamente'

Unicode Normalization

Understanding Unicode Forms

using TextAssociations, Unicode

# Same character in different forms
text_nfc = "café"  # NFC: é as single character
text_nfd = Unicode.normalize("café", :NFD)  # NFD: e + combining accent

println("Visual: both look like 'café'")
println("NFC length: $(length(text_nfc))")
println("NFD length: $(length(text_nfd))")
println("Equal? ", text_nfc == text_nfd)

# TextNorm handles this automatically
config_nfc = TextNorm(unicode_form=:NFC)
config_nfd = TextNorm(unicode_form=:NFD)
Visual: both look like 'café'
NFC length: 4
NFD length: 5
Equal? false
TextNorm(true, false, :NFD, true, true, true, false, false)

Choosing Unicode Forms

| Form  | Use Case              | Example                  |
|-------|-----------------------|--------------------------|
| :NFC  | Default, most compact | General text             |
| :NFD  | Accent stripping      | Multilingual processing  |
| :NFKC | Normalize variants    | Social media text        |
| :NFKD | Maximum decomposition | Special characters       |
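
To see what the compatibility forms actually change, here is a small illustration using Julia's Unicode standard library (the sample string is made up): :NFKC folds fullwidth letters and ligatures to their plain equivalents, while :NFC leaves them untouched.

using Unicode

sample = "ＡＩ ﬁnds café"  # fullwidth "ＡＩ" and the ligature "ﬁ"
println(Unicode.normalize(sample, :NFC))   # compatibility variants preserved
println(Unicode.normalize(sample, :NFKC))  # folded: "AI finds café"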

Language-Specific Preprocessing

Greek Text

using TextAssociations
using DataFrames: eachrow
using TextAnalysis: text

greek_text = """
Η φιλοσοφία και η επιστήμη συνδέονται στενά.
Οι Έλληνες φιλόσοφοι επηρέασαν τη σκέψη.
"""

# Greek-specific configuration
greek_config = TextNorm(
    strip_case=true,      # Greek has case
    strip_accents=true,   # Remove tonos/dialytika
    unicode_form=:NFD,    # Better accent stripping
    strip_punctuation=true
)

ct = ContingencyTable(greek_text, "φιλοσοφία"; windowsize=5, minfreq=1,
    norm_config=greek_config)
results = assoc_score(PMI, ct)

println("Greek collocations (normalized):")
for row in eachrow(results)
    println("  $(row.Collocate): PMI=$(round(row.PMI, digits=2))")
end
Greek collocations (normalized):
  επιστημη: PMI=0.0
  η: PMI=-0.69
  και: PMI=0.0
  στενα: PMI=0.0
  συνδεονται: PMI=0.0

Chinese/Japanese Text

using TextAssociations

# Chinese text (no spaces between words)
chinese = "机器学习是人工智能的重要组成部分"

# Japanese (mixed scripts)
japanese = "機械学習はAIの重要な分野です"

# CJK-specific configuration
cjk_config = TextNorm(
    strip_case=false,     # No case in CJK
    strip_accents=false,  # No accents
    strip_whitespace=true, # Remove spaces
    strip_punctuation=true
)

# Note: Proper CJK processing would require word segmentation
TextNorm(false, false, :NFC, true, true, true, true, false)
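
As a purely illustrative stopgap (not a substitute for a real segmenter such as jieba or MeCab), you can split CJK text into one-character tokens so that window-based counting has something to work with:

using Unicode: graphemes

# Naive fallback: treat every character as a separate token.
# A real pipeline would use a dictionary- or model-based segmenter.
naive_segment(s::AbstractString) = join(graphemes(s), " ")

println(naive_segment("机器学习是人工智能的重要组成部分"))
机 器 学 习 是 人 工 智 能 的 重 要 组 成 部 分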

Arabic Text

using TextAssociations

arabic = "التعلم الآلي يحول البيانات إلى معرفة"

# Arabic-specific configuration
arabic_config = TextNorm(
    strip_case=false,      # No case in Arabic
    strip_accents=false,   # Keep diacritics
    unicode_form=:NFC,     # Standard form
    strip_punctuation=true,
    normalize_whitespace=true
)

# Right-to-left text handling is automatic in Julia
TextNorm(false, false, :NFC, true, true, true, false, false)
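
Applying the configuration works the same as for any other script; this quick check (reusing arabic and arabic_config from above) confirms the pipeline runs, with Julia storing the right-to-left string as ordinary character data:

using TextAnalysis: text

processed = prep_string(arabic, arabic_config)
println(text(processed))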

Advanced Preprocessing

Custom Preprocessing Pipeline

using TextAssociations
using TextAnalysis: text

function custom_preprocess(s::String)
    # Step 1: Remove URLs
    s = replace(s, r"https?://[^\s]+" => "[URL]")

    # Step 2: Remove email addresses
    s = replace(s, r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b" => "[EMAIL]")

    # Step 3: Expand contractions. Use an ordered list rather than a Dict:
    # replacements apply sequentially, so specific forms like "won't"
    # must be handled before the generic "n't" suffix.
    contractions = [
        "won't" => "will not",
        "can't" => "cannot",
        "don't" => "do not",
        "n't" => " not",
        "'re" => " are",
        "'ve" => " have",
        "'ll" => " will",
        "'d" => " would",
        "'m" => " am"
    ]

    for (contraction, expansion) in contractions
        s = replace(s, contraction => expansion)
    end

    # Step 4: Standard normalization
    config = TextNorm(
        strip_case=true,
        strip_punctuation=true,
        normalize_whitespace=true
    )

    doc = prep_string(s, config)
    return text(doc)
end

# Test custom preprocessing
test_text = "Don't forget to check https://example.com and email me at user@example.com"
processed = custom_preprocess(test_text)
println("Original: $test_text")
println("Processed: $processed")
Original: Don't forget to check https://example.com and email me at user@example.com
Processed: do not forget to check url and email me at email

Handling Special Characters

using TextAssociations
using TextAnalysis: text

text_with_special = "Price: \$99.99 | Temperature: 25°C | Math: x² + y² = r²"

# Different strategies for special characters
configs = [
    ("Keep symbols", TextNorm(strip_punctuation=false)),
    ("Remove symbols", TextNorm(strip_punctuation=true)),
    ("Normalize", TextNorm(unicode_form=:NFKC))  # Converts ² to 2
]

for (name, config) in configs
    doc = prep_string(text_with_special, config)
    println("$name: '$(text(doc))'")
end
Keep symbols: 'price: $99.99 | temperature: 25°c | math: x² + y² = r²'
Remove symbols: 'price $99 99 | temperature 25°c | math x² + y² = r²'
Normalize: 'price $99 99 | temperature 25°c | math x2 + y2 = r2'

Performance Considerations

Preprocessing Impact on Speed

using TextAssociations
using BenchmarkTools

s = repeat("Sample text for benchmarking. ", 1000)

# Minimal preprocessing
minimal = TextNorm(
    strip_case=false,
    strip_punctuation=false,
    normalize_whitespace=false
)

# Standard preprocessing
standard = TextNorm()

# Heavy preprocessing
heavy = TextNorm(
    strip_case=true,
    strip_accents=true,
    strip_punctuation=true,
    normalize_whitespace=true
)

# Compare performance
configs = [
    ("Minimal", minimal),
    ("Standard", standard),
    ("Heavy", heavy)
]

for (name, config) in configs
    # @belapsed (from BenchmarkTools) takes multiple samples for a stable
    # estimate; interpolate globals with $ to avoid measuring dispatch
    t = @belapsed prep_string($s, $config)
    println("$name: $(round(t*1000, digits=2))ms")
end

Memory Usage

using TextAssociations

# Memory-efficient preprocessing for large texts
function stream_preprocess(file_path::String, chunk_size::Int=1024*1024)
    config = TextNorm()

    open(file_path, "r") do io
        while !eof(io)
            # Read up to chunk_size bytes. Caveat: a fixed byte boundary can
            # split a multi-byte character or a word across chunks; for exact
            # results, accumulate whole lines into chunks instead.
            chunk = read(io, chunk_size)
            chunk_text = String(chunk)

            # Process the chunk with the shared configuration
            doc = prep_string(chunk_text, config)

            # Yield the processed chunk (in practice, write to output)
            # process_chunk(text(doc))
        end
    end
end

println("Stream processing implemented for large files")
Stream processing implemented for large files

Validation and Testing

Preprocessing Verification

using TextAssociations
using TextAnalysis: text

function verify_preprocessing(original::String, config::TextNorm)
    processed = prep_string(original, config)
    processed_text = text(processed)

    println("Original: '$original'")
    println("Processed: '$processed_text'")
    println("Changes:")

    # Check case changes
    if config.strip_case && original != lowercase(original)
        println("  ✓ Case normalized")
    end

    # Check punctuation removal
    if config.strip_punctuation && occursin(r"[[:punct:]]", original)
        if !occursin(r"[[:punct:]]", processed_text)
            println("  ✓ Punctuation removed")
        end
    end

    # Check whitespace normalization
    if config.normalize_whitespace && occursin(r"\s{2,}", original)
        if !occursin(r"\s{2,}", processed_text)
            println("  ✓ Whitespace normalized")
        end
    end

    return processed_text
end

test = "HELLO,  World!!!   Multiple   spaces..."
config = TextNorm()
verify_preprocessing(test, config)
"hello world multiple spaces "

Best Practices

1. Choose Appropriate Settings

using TextAssociations

# Research/academic text
ACADEMIC_CONFIG = TextNorm(
    strip_case=true,
    strip_punctuation=true,
    normalize_whitespace=true,
    strip_accents=false  # Preserve author names
)

# Social media text
SOCIAL_CONFIG = TextNorm(
    strip_case=true,
    strip_punctuation=false,  # Keep hashtags, mentions
    normalize_whitespace=true,
    unicode_form=:NFKC  # Normalize variants
)

# Multilingual text
MULTILINGUAL_CONFIG = TextNorm(
    strip_case=true,
    strip_accents=true,  # For cross-language matching
    unicode_form=:NFD,
    normalize_whitespace=true
)
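
These profiles plug directly into the analysis entry points; for example (a toy sentence, purely illustrative):

ct = ContingencyTable("Loving the new #JuliaLang release, really loving it!", "loving";
    windowsize=3, minfreq=1, norm_config=SOCIAL_CONFIG)
results = assoc_score(PMI, ct)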

2. Document Your Choices

Always document preprocessing decisions:

using TextAssociations
using CSV, DataFrames

# Save configuration with results
function save_analysis_with_config(results::DataFrame, config::TextNorm, file::String)
    # Add preprocessing metadata
    metadata!(results, "preprocessing", config, style=:note)

    # Save to file
    CSV.write(file, results)

    # Save config separately
    config_file = replace(file, ".csv" => "_config.txt")
    open(config_file, "w") do io
        for field in fieldnames(TextNorm)
            println(io, "$field: $(getfield(config, field))")
        end
    end
end
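
A call site might look like this (results stands for a DataFrame returned by an earlier assoc_score call; the file name is a placeholder):

# save_analysis_with_config(results, TextNorm(), "collocations.csv")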

3. Test Edge Cases

using TextAssociations
using TextAnalysis: text

# Test suite for preprocessing
test_cases = [
    "normal text",
    "UPPERCASE TEXT",
    "MiXeD cAsE",
    "text-with-hyphens",
    "text_with_underscores",
    "email@example.com",
    "https://example.com",
    "café résumé naïve",
    "emoji 😀 text 🎉",
    "   extra    spaces   ",
    "text\twith\ttabs",
    "multi\nline\ntext"
]

function test_preprocessing(config::TextNorm)
    for test in test_cases
        processed = text(prep_string(test, config))
        println("'$test' → '$processed'")
    end
end
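
Running the suite with the default settings makes it easy to spot surprising transformations:

test_preprocessing(TextNorm())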

Troubleshooting

Common Issues

| Issue                | Cause                         | Solution                      |
|----------------------|-------------------------------|-------------------------------|
| Words not matching   | Different Unicode forms       | Use consistent unicode_form   |
| Missing collocations | Over-aggressive preprocessing | Reduce stripping options      |
| Too much noise       | Insufficient preprocessing    | Enable more normalization     |
| Accent issues        | Inconsistent accent handling  | Set strip_accents consistently |

Debug Helper

using TextAssociations: TextNorm, prep_string, normalize_node
using TextAnalysis: text

function debug_preprocessing(s::String, word::String, config::TextNorm)
    # Show original
    println("Original text: '$s'")
    println("Looking for: '$word'")

    # Process text
    processed = prep_string(s, config)
    processed_text = text(processed)
    println("\nProcessed text: '$processed_text'")

    # Normalize word
    normalized_word = normalize_node(word, config)
    println("Normalized word: '$normalized_word'")

    # Check if word exists
    tokens = split(lowercase(processed_text))
    found = normalized_word in tokens
    println("\nWord found: $found")

    if !found
        # Find similar words by prefix; first() is character-based, so it
        # is safe for multi-byte words where word[1:3] could throw
        prefix = first(normalized_word, 3)
        similar = filter(t -> startswith(t, prefix), tokens)
        if !isempty(similar)
            println("Similar words: ", similar)
        end
    end
end

debug_preprocessing("Café serves coffee", "cafe", TextNorm(strip_accents=false))
Original text: 'Café serves coffee'
Looking for: 'cafe'

Processed text: 'café serves coffee'
Normalized word: 'cafe'

Word found: false
Similar words: SubString{String}["café"]

Next Steps