import pdfplumber # Library for accurate PDF text extraction import re # Library for regular expressions to parse text patterns import pandas as pd # Library for creating and manipulating data tables from openpyxl import load_workbook # Library to load and modify Excel files from openpyxl.styles import Alignment, Border, Side # Styles for alignment and borders from openpyxl.chart import BarChart, PieChart, Reference # Classes for creating charts import tempfile # Library for creating temporary files import os # Library for operating system interactions import sys # Library for system-specific parameters import subprocess # Library for running external processes # Open the PDF file using pdfplumber for better text handling with pdfplumber.open('sales_data.pdf') as pdf: text = '' # Initialize an empty string to hold all extracted text # Loop through each page in the PDF for page in pdf.pages: page_text = page.extract_text() if page_text: # Check if text is extracted text += page_text + ' ' # Append text and add space for separation # Clean the text by replacing newlines with spaces for better parsing text = text.replace('\n', ' ') # Define patterns to match various phrasings, focusing on key data # State pattern handles multi-word states like 'New York' state_pattern = r"([A-Z]\w*(?: [A-Z]\w*)*)" verb_pattern = r"(added|achieved|brought in|closed|hit|reached|scored|totaled|at)" patterns = [ state_pattern + r"'s (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)", # e.g., "Oregon's Jordan Kim added $120,000" state_pattern + r" sales rep (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)", # e.g., "California sales rep Alex Johnson closed $150,000" state_pattern + r" rep (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)", # e.g., "Illinois rep Riley Wong totaled $140,000" r"in " + state_pattern + r", (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)", # e.g., "in Nevada, Taylor Lee hit $80,000" state_pattern + r" with (\w+ \w+) " + verb_pattern + r" \$(\d{1,3}(?:,\d{3})*)", # e.g., "Texas with Jamie Chen at $180,000" ] # Find all matches using the patterns matches = [] for pattern in patterns: matches.extend(re.findall(pattern, text)) # Define a dictionary to map states to regions region_map = { 'California': 'West', 'Nevada': 'West', 'Oregon': 'West', 'New York': 'North', 'Pennsylvania': 'North', 'Illinois': 'North', 'Texas': 'South', 'Florida': 'South', 'Georgia': 'South', 'Massachusetts': 'East', 'Virginia': 'East', 'Maryland': 'East' } # Initialize an empty list for parsed data data = [] # Process each match for match in matches: state, rep, verb, amount = match # Convert amount to integer amount_int = int(amount.replace(',', '')) # Get region, default to 'Unknown' region = region_map.get(state, 'Unknown') # Append data (keep Sales as int for numeric formatting in Excel) data.append({'Region': region, 'State': state, 'Rep': rep, 'Sales': amount_int}) # Create a DataFrame df = pd.DataFrame(data) # Create a temporary Excel file with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp: # Write DataFrame to Excel with custom sheet name writer = pd.ExcelWriter(tmp.name, engine='openpyxl') df.to_excel(writer, index=False, sheet_name='2nd Quarter Sales') writer.close() # Load the workbook for formatting and chart creation wb = load_workbook(tmp.name) ws = wb['2nd Quarter Sales'] # Function to apply consistent formatting to a worksheet (right-align sales, currency, borders, auto-adjust widths) def format_worksheet(worksheet, sales_col='B'): # Right-align and format Sales column as currency, starting from row 2 for row in worksheet.iter_rows(min_row=2, max_row=worksheet.max_row, min_col=2, max_col=2): for cell in row: cell.number_format = '$#,##0' cell.alignment = Alignment(horizontal='right') # Add thin borders to all cells in the used range thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin')) for row in worksheet.iter_rows(min_row=1, max_row=worksheet.max_row, min_col=1, max_col=worksheet.max_column): for cell in row: cell.border = thin_border # Auto-adjust column widths, accounting for formatted values in Sales column for column in worksheet.iter_cols(min_row=1, max_row=worksheet.max_row, min_col=1, max_col=worksheet.max_column): max_length = 0 column_letter = column[0].column_letter for cell in column: try: if column_letter == sales_col and cell.row > 1: # For Sales data rows, use formatted length formatted_value = f"${cell.value:,}" length = len(formatted_value) else: length = len(str(cell.value)) if length > max_length: max_length = length except: pass adjusted_width = max_length + 2 # Add padding worksheet.column_dimensions[column_letter].width = adjusted_width # Apply formatting to main data sheet format_worksheet(ws, sales_col='D') # Aggregate data for charts region_sums = df.groupby('Region')['Sales'].sum().reset_index() state_sums = df.groupby('State')['Sales'].sum().reset_index() rep_sums = df.groupby('Rep')['Sales'].sum().reset_index() # Create sheet for Sales by Region region_ws = wb.create_sheet(title='Sales by Region') # Write aggregated data region_ws.append(['Region', 'Sales']) for row in region_sums.itertuples(index=False): region_ws.append([row.Region, row.Sales]) # Apply formatting format_worksheet(region_ws) # Create bar chart bar_chart = BarChart() bar_chart.title = "Sales by Region" bar_chart.x_axis.title = "Region" bar_chart.y_axis.title = "Sales ($)" data = Reference(region_ws, min_col=2, min_row=2, max_row=region_ws.max_row) categories = Reference(region_ws, min_col=1, min_row=2, max_row=region_ws.max_row) bar_chart.add_data(data) bar_chart.set_categories(categories) region_ws.add_chart(bar_chart, "D2") # Create sheet for Sales by State state_ws = wb.create_sheet(title='Sales by State') # Write aggregated data state_ws.append(['State', 'Sales']) for row in state_sums.itertuples(index=False): state_ws.append([row.State, row.Sales]) # Apply formatting format_worksheet(state_ws) # Create bar chart (vertical bars) col_chart = BarChart() col_chart.title = "Sales by State" col_chart.x_axis.title = "State" col_chart.y_axis.title = "Sales ($)" data = Reference(state_ws, min_col=2, min_row=2, max_row=state_ws.max_row) categories = Reference(state_ws, min_col=1, min_row=2, max_row=state_ws.max_row) col_chart.add_data(data) col_chart.set_categories(categories) state_ws.add_chart(col_chart, "D2") # Create sheet for Sales by Rep rep_ws = wb.create_sheet(title='Sales by Rep') # Write aggregated data rep_ws.append(['Rep', 'Sales']) for row in rep_sums.itertuples(index=False): rep_ws.append([row.Rep, row.Sales]) # Apply formatting format_worksheet(rep_ws) # Create pie chart pie_chart = PieChart() pie_chart.title = "Sales by Rep" data = Reference(rep_ws, min_col=2, min_row=2, max_row=rep_ws.max_row) categories = Reference(rep_ws, min_col=1, min_row=2, max_row=rep_ws.max_row) pie_chart.add_data(data) pie_chart.set_categories(categories) rep_ws.add_chart(pie_chart, "D2") # Save the formatted workbook wb.save(tmp.name) # Open the file cross-platform if sys.platform == "win32": os.startfile(tmp.name) elif sys.platform == "darwin": subprocess.call(["open", tmp.name]) else: try: subprocess.call(["xdg-open", tmp.name]) except OSError: print("Could not open the file automatically. Please open it manually at:", tmp.name)