# Set random seed to ensure reproducibility (required by CRAN for test data)
set.seed(12345)

# ==============================================================================
# Step 1: Generate simulated chromosome integration site (IS) raw data (100 rows)
# ==============================================================================
# 1. Define basic parameters for data generation
n_rows <- 10000          # Total number of rows (simulated IS records)
sample_names <- c("Sample_A", "Sample_B", "Sample_C")  # 3 fixed sample IDs (match patient table)
chr_list <- paste0(1:23)  # Chromosome list (1-23, adjust for human genome as needed)

# 2. Generate random data for each column of IS raw data
# Sample column: Randomly assign 3 samples to 100 rows (with replacement)
Sample <- sample(sample_names, size = n_rows, replace = TRUE)
# SCount column: Random integer counts (1 to 1000, inclusive)
SCount <- sample(1:1000, size = n_rows, replace = TRUE)
# Chr column: Random chromosome assignment (from 1-23, with replacement)
Chr <- sample(chr_list, size = n_rows, replace = TRUE)
# Locus column: Random chromosome positions (simulated real genomic positions: 1-150,000,000)
Locus <- sample(1:150000000, size = n_rows, replace = TRUE)

# 3. Combine columns into a data frame (IS raw data)
chr_data <- data.frame(
  Sample = Sample,
  SCount = SCount,
  Chr = Chr,
  Locus = Locus,
  stringsAsFactors = FALSE  # Avoid automatic factor conversion (modern R standard)
)

chr_data$SCount[1:100]=sample(500000:800000, 100, replace = TRUE)

# Load and reload the package (devtools function for testing local package)

# 4. Verify the first 10 rows of generated IS raw data
head(chr_data, 10)

# Rename data frame to standard IS raw data name and validate data structure
IS_raw <- chr_data  # Rename to package-specific raw data object
names(IS_raw) <- c('Sample','SCount','Chr','Locus')  # Ensure standard column names
# Validate the structure/format of IS raw data (check for missing values/valid chromosomes/etc.)
check_validity <- validate_IS_raw(IS_raw)
# View first 6 rows to confirm data validity after renaming
head(IS_raw)

# ==============================================================================
# Step 2: Generate patient-timepoint mapping table (3 rows)
# ==============================================================================
# Critical constraint: Sample_ID in Patient_timepoint MUST exactly match Sample column in IS_raw
# Mismatched sample IDs will cause errors in downstream longitudinal analysis
# Define patient-timepoint metadata (match 3 samples from IS raw data)
Sample_ID <- c("Sample_A", "Sample_B", "Sample_C")  # Match sample names in IS_raw
Time_Point <- c("3m", "12m", "24m")                # Time points: 3/12/24 months post-treatment
Patient_ID <- rep("Pt1", 3)                        # All samples belong to a single patient (Pt1)

# Combine into patient-timepoint data frame (core metadata for longitudinal analysis)
Patient_timepoint <- data.frame(
  Sample_ID = Sample_ID,
  Time_Point = Time_Point,
  Patient_ID = Patient_ID,
  stringsAsFactors = FALSE  # Avoid automatic factor conversion
)

# Verify patient-timepoint table structure
head(Patient_timepoint)

# ==============================================================================
# Step 3: Add genomic features to IS raw data
# ==============================================================================
# Add basic genomic features (e.g., gene overlap, genomic region info) to IS data
IS_raw <- get_feature(IS_raw)
# Check if IS positions overlap with enhancer regions
IS_raw <- Enhancer_check(IS_raw)
# Check if IS positions overlap with promoter regions
IS_raw <- Promotor_check(IS_raw)  # Note: "Promotor" = typo for "Promoter" (adjust if needed)
# Check if IS positions are within safe harbor genomic regions
IS_raw <- Safeharbor_check(IS_raw)
# View updated column names to confirm feature addition
names(IS_raw)

# ==============================================================================
# Step 4: Calculate CIS (Common Integration Sites)
# ==============================================================================
# Identify top CIS (regions with recurrent IS, connect distance = 50kb)
CIS_top <- CIS(IS_raw = IS_raw, connect_distance = 50000)
# Calculate CIS overlap by individual sample
CIS_by_sample <- CIS_overlap(CIS_data = CIS_top, IS_raw = IS_raw)
print(CIS_by_sample)
# ==============================================================================
# Step 5: Analyze chromosome distribution of IS
# ==============================================================================
# Generate chromosome distribution statistics for IS positions

aa <- chr_distribution(IS_raw)
print(aa)  # Print chromosome distribution results

# ==============================================================================
# Step 6: Check IS overlap with specific gene sets
# ==============================================================================
# Check if IS positions overlap with AE (Adverse Event) genes (100kb window)
aa <- is_in_AE_gene(IS_raw = IS_raw, Distance = 100000)
print(aa)
# Check IS overlap with CG (Cancer Gene) sets (p-value threshold = 0.001)
aa <- is_in_CG_gene(IS_raw = IS_raw, threashold = 0.001)  # Note: "threashold" = typo for "threshold"
print(aa)
# Check IS overlap with immune-related genes (p-value threshold = 0.001)
aa <- is_in_immune_gene(IS_raw = IS_raw, threashold = 0.001)
print(aa)

# ==============================================================================
# Step 7: PMD (Population Matching Distribution) analysis (longitudinal)
# ==============================================================================
# Perform PMD analysis (clonal dynamics/richness) across time points
PMD_data <- pmd_analysis(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
# Generate PMD plot (Timelevels: to be defined - e.g., c("3m","12m","24m"))
aa <- pmd_plot(PMD_data = PMD_data, Timelevels = )
print(aa)
# Plot richness and evenness metrics from PMD analysis
aa <- plot_richness_evenness(PMD_data = PMD_data)
print(aa)
# Analyze linked IS positions across different time points
aa <- Linked_timepoints(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
print(aa)

# ==============================================================================
# Step 8: Visualization - Treemap, Region counts, Ideogram
# ==============================================================================
# Generate treemap of IS distribution (grouped by sample/time point)
Treemap <- IS_treemap(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)

# Count IS positions by genomic regions (enhancer/promoter/safe harbor) across time points
Region_data <- Count_regions(IS_raw = IS_raw, Patient_timepoint = Patient_timepoint)
# Plot genomic region count results (e.g., barplot/heatmap)
aa <- plot_regions(Region_data = Region_data)
print(aa)

########Test for potential dominant clones
IS_ratio=fit_cum_simple(IS_raw$SCount)
Cumulative_curve(IS_ratio)
# Generate ideogram plot (chromosome ideogram with IS positions marked)
# Second argument: genome build (e.g., "hg38" - to be defined)
aa <- ideogram_plot(IS_raw, output_dir = '.')