import pdfplumber  # Library for accurate PDF text extraction
import re  # Library for regular expressions to parse text patterns
import pandas as pd  # Library for creating and manipulating data tables
from openpyxl import load_workbook  # Library to load and modify Excel files
from openpyxl.styles import Alignment, Border, Side  # Styles for alignment and borders
from openpyxl.chart import BarChart, PieChart, Reference  # Classes for creating charts
import tempfile  # Library for creating temporary files
import os  # Library for operating system interactions
import sys  # Library for system-specific parameters
import subprocess  # Library for running external processes

# Open the PDF file using pdfplumber for better text handling
with pdfplumber.open('sales_data.pdf') as pdf:
    text = ''  # Initialize an empty string to hold all extracted text
    # Loop through each page in the PDF
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:  # Check if text is extracted
            text += page_text + ' '  # Append text and add space for separation

# Clean the text by replacing newlines with spaces for better parsing
text = text.replace('\n', ' ')

# Define patterns to match various phrasings, focusing on key data
# State pattern handles multi-word states like 'New York'
state_pattern = r"([A-Z]\w*(?: [A-Z]\w*)*)"
verb_pattern = r"(added|achieved|brought in|closed|hit|reached|scored|totaled|at)"
patterns = [
    state_pattern + r"'s (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)",  # e.g., "Oregon's Jordan Kim added $120,000"
    state_pattern + r" sales rep (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)",  # e.g., "California sales rep Alex Johnson closed $150,000"
    state_pattern + r" rep (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)",  # e.g., "Illinois rep Riley Wong totaled $140,000"
    r"in " + state_pattern + r", (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)",  # e.g., "in Nevada, Taylor Lee hit $80,000"
    state_pattern + r" with (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)",  # e.g., "Texas with Jamie Chen at $180,000"
]

# Find all matches using the patterns
matches = []
for pattern in patterns:
    matches.extend(re.findall(pattern, text))

# Define a dictionary to map states to regions
region_map = {
    'California': 'West', 'Nevada': 'West', 'Oregon': 'West',
    'New York': 'North', 'Pennsylvania': 'North', 'Illinois': 'North',
    'Texas': 'South', 'Florida': 'South', 'Georgia': 'South',
    'Massachusetts': 'East', 'Virginia': 'East', 'Maryland': 'East'
}

# Initialize an empty list for parsed data
data = []
# Process each match
for match in matches:
    state, rep, verb, amount = match
    # Convert amount to integer
    amount_int = int(amount.replace(',', ''))
    # Get region, default to 'Unknown'
    region = region_map.get(state, 'Unknown')
    # Append data (keep Sales as int for numeric formatting in Excel)
    data.append({'Region': region, 'State': state, 'Rep': rep, 'Sales': amount_int})

# Create a DataFrame
df = pd.DataFrame(data)

# Create a temporary Excel file
with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
    # Write DataFrame to Excel with custom sheet name
    writer = pd.ExcelWriter(tmp.name, engine='openpyxl')
    df.to_excel(writer, index=False, sheet_name='2nd Quarter Sales')
    writer.close()

    # Load the workbook for formatting and chart creation
    wb = load_workbook(tmp.name)
    ws = wb['2nd Quarter Sales']

    # Function to apply consistent formatting to a worksheet (right-align sales, currency, borders, auto-adjust widths)
    def format_worksheet(worksheet, sales_col='B'):
        # Right-align and format Sales column as currency, starting from row 2
        for row in worksheet.iter_rows(min_row=2, max_row=worksheet.max_row, min_col=2, max_col=2):
            for cell in row:
                cell.number_format = '$#,##0'
                cell.alignment = Alignment(horizontal='right')

        # Add thin borders to all cells in the used range
        thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))
        for row in worksheet.iter_rows(min_row=1, max_row=worksheet.max_row, min_col=1, max_col=worksheet.max_column):
            for cell in row:
                cell.border = thin_border

        # Auto-adjust column widths, accounting for formatted values in Sales column
        for column in worksheet.iter_cols(min_row=1, max_row=worksheet.max_row, min_col=1, max_col=worksheet.max_column):
            max_length = 0
            column_letter = column[0].column_letter
            for cell in column:
                try:
                    if column_letter == sales_col and cell.row > 1:  # For Sales data rows, use formatted length
                        formatted_value = f"${cell.value:,}"
                        length = len(formatted_value)
                    else:
                        length = len(str(cell.value))
                    if length > max_length:
                        max_length = length
                except:
                    pass
            adjusted_width = max_length + 2  # Add padding
            worksheet.column_dimensions[column_letter].width = adjusted_width

    # Apply formatting to main data sheet
    format_worksheet(ws, sales_col='D')

    # Aggregate data for charts
    region_sums = df.groupby('Region')['Sales'].sum().reset_index()
    state_sums = df.groupby('State')['Sales'].sum().reset_index()
    rep_sums = df.groupby('Rep')['Sales'].sum().reset_index()

    # Create sheet for Sales by Region
    region_ws = wb.create_sheet(title='Sales by Region')
    # Write aggregated data
    region_ws.append(['Region', 'Sales'])
    for row in region_sums.itertuples(index=False):
        region_ws.append([row.Region, row.Sales])
    # Apply formatting
    format_worksheet(region_ws)
    # Create bar chart
    bar_chart = BarChart()
    bar_chart.title = "Sales by Region"
    bar_chart.x_axis.title = "Region"
    bar_chart.y_axis.title = "Sales ($)"
    data = Reference(region_ws, min_col=2, min_row=2, max_row=region_ws.max_row)
    categories = Reference(region_ws, min_col=1, min_row=2, max_row=region_ws.max_row)
    bar_chart.add_data(data)
    bar_chart.set_categories(categories)
    region_ws.add_chart(bar_chart, "D2")

    # Create sheet for Sales by State
    state_ws = wb.create_sheet(title='Sales by State')
    # Write aggregated data
    state_ws.append(['State', 'Sales'])
    for row in state_sums.itertuples(index=False):
        state_ws.append([row.State, row.Sales])
    # Apply formatting
    format_worksheet(state_ws)
    # Create bar chart (vertical bars)
    col_chart = BarChart()
    col_chart.title = "Sales by State"
    col_chart.x_axis.title = "State"
    col_chart.y_axis.title = "Sales ($)"
    data = Reference(state_ws, min_col=2, min_row=2, max_row=state_ws.max_row)
    categories = Reference(state_ws, min_col=1, min_row=2, max_row=state_ws.max_row)
    col_chart.add_data(data)
    col_chart.set_categories(categories)
    state_ws.add_chart(col_chart, "D2")

    # Create sheet for Sales by Rep
    rep_ws = wb.create_sheet(title='Sales by Rep')
    # Write aggregated data
    rep_ws.append(['Rep', 'Sales'])
    for row in rep_sums.itertuples(index=False):
        rep_ws.append([row.Rep, row.Sales])
    # Apply formatting
    format_worksheet(rep_ws)
    # Create pie chart
    pie_chart = PieChart()
    pie_chart.title = "Sales by Rep"
    data = Reference(rep_ws, min_col=2, min_row=2, max_row=rep_ws.max_row)
    categories = Reference(rep_ws, min_col=1, min_row=2, max_row=rep_ws.max_row)
    pie_chart.add_data(data)
    pie_chart.set_categories(categories)
    rep_ws.add_chart(pie_chart, "D2")

    # Save the formatted workbook
    wb.save(tmp.name)

# Open the file cross-platform
if sys.platform == "win32":
    os.startfile(tmp.name)
elif sys.platform == "darwin":
    subprocess.call(["open", tmp.name])
else:
    try:
        subprocess.call(["xdg-open", tmp.name])
    except OSError:
        print("Could not open the file automatically. Please open it manually at:", tmp.name)