# [Temporal Analysis](@id advanced_temporal)
Analyze how word associations change over time periods.
## Overview
Temporal analysis tracks the evolution of collocations across time, revealing trends, emerging terminology, and changing language patterns.
## Basic Temporal Analysis
```julia
using TextAssociations, Dates, DataFrames

# Texts paired with their publication dates
texts = [
    ("Early computers used vacuum tubes.", Date(1950)),
    ("Transistors replaced vacuum tubes.", Date(1960)),
    ("Integrated circuits revolutionized computing.", Date(1970)),
    ("Microprocessors enabled personal computers.", Date(1980)),
    ("Internet connected computers globally.", Date(1990)),
    ("Cloud computing emerged as dominant paradigm.", Date(2000)),
    ("AI and machine learning transform computing.", Date(2010)),
    ("Quantum computing shows promise.", Date(2020))
]

# Create a corpus with the year as temporal metadata
df = DataFrame(
    text = [t[1] for t in texts],
    year = [year(t[2]) for t in texts]
)

corpus = read_corpus_df(df;
    text_column=:text,
    metadata_columns=[:year]
)

# Track associations of the node words across four time bins
temporal = analyze_temporal(
    corpus,
    ["computing", "computers"],
    :year,
    PMI;
    time_bins=4,
    windowsize=5,
    minfreq=1
)

println("Temporal Analysis Results:")
println("Time periods analyzed: ", temporal.time_periods)

if !isempty(temporal.trend_analysis)
    println("\nTop trending associations:")
    trending = first(sort(temporal.trend_analysis, :Correlation, rev=true), 5)
    for row in eachrow(trending)
        println("  $(row.Node) + $(row.Collocate): r=$(round(row.Correlation, digits=2))")
    end
end
```
```
Processing DataFrame... 100%|████████████████████████████| Time: 0:00:00
Temporal Analysis Results:
Time periods analyzed: ["1950.0-1968.0", "1968.0-1985.0", "1985.0-2002.0", "2002.0-2020.0"]
```
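The period labels show what `time_bins=4` does: the observed year range is cut into equal-width intervals with edges rounded to whole years. A minimal sketch of that arithmetic, which reproduces the labels above (the exact rule `analyze_temporal` applies internally is an assumption here):

```julia
# Equal-width binning over the observed year range; rounding the edges
# reproduces the period labels above (the library's exact rule is assumed)
lo, hi, nbins = 1950.0, 2020.0, 4
edges = round.([lo + i * (hi - lo) / nbins for i in 0:nbins])
labels = ["$(edges[i])-$(edges[i+1])" for i in 1:nbins]
# ["1950.0-1968.0", "1968.0-1985.0", "1985.0-2002.0", "2002.0-2020.0"]
```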
## Trend Detection
### Identifying Emerging Terms
```julia
using TextAssociations, DataFrames

function identify_emerging_terms(temporal_analysis::TemporalCorpusAnalysis,
                                 threshold::Float64=0.5)
    trends = temporal_analysis.trend_analysis

    # Keep associations whose scores rise consistently over time
    emerging = filter(row -> row.Correlation > threshold, trends)

    # Sort by slope (rate of change)
    sort!(emerging, :Slope, rev=true)

    println("Emerging Terms (correlation > $threshold):")
    for row in eachrow(first(emerging, 10))
        trend = row.Slope > 0 ? "↑" : "↓"
        println("  $(row.Node) + $(row.Collocate): $trend slope=$(round(row.Slope, digits=3))")
    end

    return emerging
end

# Apply to our temporal analysis
# emerging_terms = identify_emerging_terms(temporal)
println("\nNote: Full trend detection requires more data points")
```
```
Note: Full trend detection requires more data points
```
### Detecting Declining Associations
```julia
using TextAssociations, DataFrames

function identify_declining_terms(temporal_analysis::TemporalCorpusAnalysis)
    trends = temporal_analysis.trend_analysis

    # Keep associations whose scores fall over time
    declining = filter(row -> row.Correlation < -0.3, trends)

    println("Declining Associations:")
    if !isempty(declining)
        for row in eachrow(declining)
            println("  $(row.Node) + $(row.Collocate): correlation=$(round(row.Correlation, digits=2))")
        end
    else
        println("  No strongly declining associations found")
    end

    return declining
end

# Apply to our analysis
# declining = identify_declining_terms(temporal)
```
```
identify_declining_terms (generic function with 1 method)
```
## Period Comparison
### Cross-Period Analysis
```julia
using TextAssociations, DataFrames

function compare_periods(temporal_analysis::TemporalCorpusAnalysis,
                         period1::String, period2::String)
    results1 = temporal_analysis.results_by_period[period1]
    results2 = temporal_analysis.results_by_period[period2]

    # Consider every node analyzed in either period
    nodes = union(results1.nodes, results2.nodes)

    comparison = DataFrame()
    for node in nodes
        if haskey(results1.results, node) && haskey(results2.results, node)
            df1 = results1.results[node]
            df2 = results2.results[node]

            # Collocates present in both periods
            common = intersect(df1.Collocate, df2.Collocate)

            for collocate in common
                idx1 = findfirst(==(collocate), df1.Collocate)
                idx2 = findfirst(==(collocate), df2.Collocate)

                if idx1 !== nothing && idx2 !== nothing
                    # First metric column after the standard columns
                    # (names() returns strings, so compare against strings)
                    metric_col = names(df1)[findfirst(n -> n ∉ ["Node", "Collocate", "Frequency", "DocFrequency"], names(df1))]
                    score1 = df1[idx1, metric_col]
                    score2 = df2[idx2, metric_col]

                    push!(comparison, (
                        Node = node,
                        Collocate = collocate,
                        Period1_Score = score1,
                        Period2_Score = score2,
                        Change = score2 - score1,
                        PercentChange = (score2 - score1) / abs(score1) * 100
                    ))
                end
            end
        end
    end

    return comparison
end

# Example comparison
if length(temporal.time_periods) >= 2
    period1 = temporal.time_periods[1]
    period2 = temporal.time_periods[end]
    println("\nComparing $period1 vs $period2:")
    # comparison = compare_periods(temporal, period1, period2)
end
```
```
Comparing 1950.0-1968.0 vs 2002.0-2020.0:
```
## Visualization Preparation
### Time Series Data
```julia
using TextAssociations, DataFrames

function prepare_timeseries_data(temporal_analysis::TemporalCorpusAnalysis,
                                 node::String, collocate::Symbol)
    periods = String[]
    scores = Float64[]

    for period in sort(temporal_analysis.time_periods)
        if haskey(temporal_analysis.results_by_period, period)
            results = temporal_analysis.results_by_period[period]
            if haskey(results.results, node)
                df = results.results[node]
                idx = findfirst(==(collocate), df.Collocate)
                if idx !== nothing
                    # First metric column after the standard columns
                    metric_cols = filter(n -> n ∉ ["Node", "Collocate", "Frequency", "DocFrequency"], names(df))
                    if !isempty(metric_cols)
                        push!(periods, period)
                        push!(scores, df[idx, metric_cols[1]])
                    end
                end
            end
        end
    end

    return DataFrame(Period=periods, Score=scores)
end

# Prepare data for plotting
# timeseries = prepare_timeseries_data(temporal, "computing", :ai)
println("\nTime series data structure prepared for visualization")
```
```
Time series data structure prepared for visualization
```
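Once the scores are in a tidy `DataFrame`, any plotting package can render the trend. A minimal sketch assuming Plots.jl, which is not a dependency of TextAssociations:

```julia
# Assumes Plots.jl is installed; TextAssociations does not provide plotting
using Plots

timeseries = prepare_timeseries_data(temporal, "computing", :ai)
plot(timeseries.Period, timeseries.Score;
     xlabel="Period", ylabel="Association score",
     label="computing + ai", marker=:circle, xrotation=45)
```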
## Advanced Temporal Patterns
### Burst Detection
```julia
using TextAssociations, DataFrames, Statistics

function detect_bursts(temporal_analysis::TemporalCorpusAnalysis,
                       z_threshold::Float64=2.0)
    bursts = DataFrame()
    first_period = first(temporal_analysis.time_periods)

    # Iterate over the nodes analyzed in the first period
    for (node, _) in temporal_analysis.results_by_period[first_period].results
        # Track each collocate's scores and the periods they come from
        collocate_scores = Dict{Symbol, Vector{Float64}}()
        collocate_periods = Dict{Symbol, Vector{String}}()

        for period in temporal_analysis.time_periods
            haskey(temporal_analysis.results_by_period[period].results, node) || continue
            period_df = temporal_analysis.results_by_period[period].results[node]

            # First metric column after the standard columns (names() returns strings)
            metric_cols = filter(n -> n ∉ ["Node", "Collocate", "Frequency", "DocFrequency"], names(period_df))
            isempty(metric_cols) && continue

            for row in eachrow(period_df)
                push!(get!(collocate_scores, row.Collocate, Float64[]), row[metric_cols[1]])
                push!(get!(collocate_periods, row.Collocate, String[]), period)
            end
        end

        # A burst is a period whose score sits far above the collocate's mean
        for (collocate, scores) in collocate_scores
            length(scores) > 2 || continue
            μ, σ = mean(scores), std(scores)
            σ > 0 || continue
            z_scores = (scores .- μ) ./ σ
            max_z = maximum(z_scores)
            if max_z > z_threshold
                # Index into the periods actually observed for this collocate,
                # since the node may be absent from some periods
                burst_period = collocate_periods[collocate][argmax(z_scores)]
                push!(bursts, (
                    Node = node,
                    Collocate = collocate,
                    BurstPeriod = burst_period,
                    ZScore = max_z
                ))
            end
        end
    end

    if !isempty(bursts)
        sort!(bursts, :ZScore, rev=true)
        println("Detected Bursts (z > $z_threshold):")
        for row in eachrow(first(bursts, min(5, nrow(bursts))))
            println("  $(row.Node) + $(row.Collocate) in $(row.BurstPeriod): z=$(round(row.ZScore, digits=2))")
        end
    else
        println("No significant bursts detected")
    end

    return bursts
end

# Detect bursts in our data
# bursts = detect_bursts(temporal)
```
```
detect_bursts (generic function with 2 methods)
```
### Semantic Shift Detection
```julia
using TextAssociations, DataFrames

function detect_semantic_shift(temporal_analysis::TemporalCorpusAnalysis,
                               node::String, threshold::Float64=0.5)
    periods = temporal_analysis.time_periods
    if length(periods) < 2
        println("Need at least 2 periods for semantic shift detection")
        return DataFrame()
    end

    # Compare the first and last periods
    first_period = periods[1]
    last_period = periods[end]

    shifts = DataFrame()
    if haskey(temporal_analysis.results_by_period[first_period].results, node) &&
       haskey(temporal_analysis.results_by_period[last_period].results, node)

        first_df = temporal_analysis.results_by_period[first_period].results[node]
        last_df = temporal_analysis.results_by_period[last_period].results[node]

        # Collocates unique to each period
        early_only = setdiff(first_df.Collocate, last_df.Collocate)
        late_only = setdiff(last_df.Collocate, first_df.Collocate)

        println("Semantic shift for '$node':")
        println("  Lost associations ($(first_period)): ", first(early_only, 5))
        println("  New associations ($(last_period)): ", first(late_only, 5))

        # Shift magnitude = 1 - Jaccard similarity of the two collocate sets
        all_collocates = union(first_df.Collocate, last_df.Collocate)
        overlap = intersect(first_df.Collocate, last_df.Collocate)
        jaccard = length(overlap) / length(all_collocates)
        shift_magnitude = 1 - jaccard

        println("  Semantic shift magnitude: $(round(shift_magnitude, digits=2))")
        shift_magnitude > threshold && println("  Shift exceeds threshold $threshold")

        # Record the lost and new associations
        for c in early_only
            push!(shifts, (Collocate=c, Status="lost"))
        end
        for c in late_only
            push!(shifts, (Collocate=c, Status="new"))
        end
    end

    return shifts
end

# Analyze semantic shift
# detect_semantic_shift(temporal, "computing")
```
```
detect_semantic_shift (generic function with 2 methods)
```
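As a quick sanity check of the magnitude formula, here it is on two made-up collocate sets (not output from the corpus above):

```julia
# Toy illustration of the Jaccard-based shift magnitude; sets are made up
early = Set([:tubes, :vacuum, :circuits])
late  = Set([:vacuum, :quantum, :cloud])
jaccard = length(intersect(early, late)) / length(union(early, late))  # 1/5 = 0.2
shift_magnitude = 1 - jaccard                                          # 0.8
```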
## Best Practices
### 1. Time Bin Selection
```julia
# Heuristic for choosing time_bins: scale with the time span, but never
# create more bins than the document count can support
function optimal_time_bins(corpus_size::Int, time_span::Int)
    max_bins = max(2, corpus_size ÷ 2)  # keep roughly two or more documents per bin
    if time_span < 10
        span_bins = 3    # Few bins for short spans
    elseif time_span < 50
        span_bins = 10   # Moderate bins
    else
        span_bins = 20   # More bins for long spans
    end
    return clamp(span_bins, 2, max_bins)
end
```
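For instance, with the eight-document, 70-year corpus above, the document count rather than the span is the binding constraint:

```julia
optimal_time_bins(8, 70)  # the span suggests 20 bins, but 8 documents cap it at 4
```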
### 2. Minimum Data Requirements
```julia
# Rules of thumb for sufficient data per period
const MIN_DOCS_PER_PERIOD = 10
const MIN_TOKENS_PER_PERIOD = 1000
const MIN_NODE_FREQ_PER_PERIOD = 5
```
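A minimal sketch of applying these thresholds; `doc_count`, `token_count`, and `node_freq` are hypothetical per-period statistics you would compute from your own corpus metadata:

```julia
# Hypothetical check; the three arguments are per-period statistics
# you would compute yourself from the corpus metadata
function period_has_enough_data(doc_count::Int, token_count::Int, node_freq::Int)
    doc_count >= MIN_DOCS_PER_PERIOD &&
        token_count >= MIN_TOKENS_PER_PERIOD &&
        node_freq >= MIN_NODE_FREQ_PER_PERIOD
end

period_has_enough_data(12, 1500, 7)  # true
period_has_enough_data(3, 400, 2)    # false: too sparse to trust trends
```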
### 3. Trend Validation
```julia
# Validate a trend before reporting it: require a minimum correlation
# strength and enough time periods (cross-metric checks are left out here)
function validate_trend(temporal_analysis::TemporalCorpusAnalysis,
                        node::String, collocate::Symbol;
                        min_correlation::Float64=0.7, min_periods::Int=4)
    trends = temporal_analysis.trend_analysis
    idx = findfirst(r -> r.Node == node && r.Collocate == collocate, eachrow(trends))
    idx !== nothing && abs(trends[idx, :Correlation]) >= min_correlation &&
        length(temporal_analysis.time_periods) >= min_periods
end
```
## Next Steps
- Explore Network Analysis for visualizing temporal changes
- See Keyword Extraction for period-specific keywords
- Review Performance Guide for large temporal corpora