import os
import sys
import argparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, unquote
import json
import time
import re
from pathlib import Path
import jsbeautifier
import cssbeautifier
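
# Single-page website copier: downloads one HTML page plus its CSS, JS, image,
# and font assets into a local directory, rewriting references so the copy can
# be served offline. SinglePageCopier is the programmatic entry point; the CLI
# lives in main() below.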
class SinglePageCopier:
    def __init__(self, output_dir="output", verbose=False, js_mode="keep"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.verbose = verbose
        self.js_mode = js_mode
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.assets_downloaded = {}
        self.inline_js_counter = 0
        self.inline_css_counter = 0
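
    # js_mode is one of: "keep" (download JS as-is, default), "disable" (strip
    # all <script> tags), "extract" (move inline JS/CSS into separate files),
    # "prettify" (extract + beautify minified sources), or "clean" (extract,
    # beautify, and strip animation/visibility styles for a static snapshot).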
    def log(self, message):
        if self.verbose:
            print(f"[INFO] {message}")

    def error(self, message):
        print(f"[ERROR] {message}")

    def success(self, message):
        print(f"[SUCCESS] {message}")
    def download_asset(self, asset_url, base_url):
        try:
            full_url = urljoin(base_url, asset_url)
            # Cache hit: each unique URL is downloaded only once per run.
            if full_url in self.assets_downloaded:
                return self.assets_downloaded[full_url]
            self.log(f"Downloading asset: {full_url}")
            response = self.session.get(full_url, timeout=30)
            response.raise_for_status()
            parsed_url = urlparse(full_url)
            path_parts = parsed_url.path.strip('/').split('/')
            is_font = any(full_url.endswith(ext) for ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot'])
            if len(path_parts) > 1:
                asset_dir = self.output_dir / "assets" / "/".join(path_parts[:-1])
                asset_dir.mkdir(parents=True, exist_ok=True)
                local_path = asset_dir / path_parts[-1]
            else:
                asset_dir = self.output_dir / "assets"
                asset_dir.mkdir(parents=True, exist_ok=True)
                local_path = asset_dir / (path_parts[0] if path_parts[0] else "asset")
            if parsed_url.query:
                filename = local_path.name
                if '.' not in filename:
                    content_type = response.headers.get('content-type', '')
                    if 'css' in content_type:
                        filename += '.css'
                    elif 'javascript' in content_type:
                        filename += '.js'
                    elif 'image' in content_type:
                        if 'png' in content_type:
                            filename += '.png'
                        elif 'jpeg' in content_type or 'jpg' in content_type:
                            filename += '.jpg'
                        elif 'gif' in content_type:
                            filename += '.gif'
                        elif 'svg' in content_type:
                            filename += '.svg'
                    elif 'font' in content_type or is_font:
                        if 'woff2' in content_type:
                            filename += '.woff2'
                        elif 'woff' in content_type:
                            filename += '.woff'
                        elif 'truetype' in content_type:
                            filename += '.ttf'
                        elif 'opentype' in content_type:
                            filename += '.otf'
                local_path = local_path.parent / filename
            with open(local_path, 'wb') as f:
                f.write(response.content)
            relative_path = os.path.relpath(local_path, self.output_dir)
            self.assets_downloaded[full_url] = relative_path
            if is_font:
                self.log(f"Font asset saved: {relative_path}")
            else:
                self.log(f"Asset saved: {relative_path}")
            return relative_path
        except Exception as e:
            self.error(f"Failed to download asset {asset_url}: {str(e)}")
            return asset_url
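
    # process_css_content rewrites url(...) and @import references inside a
    # stylesheet so they point at downloaded copies. Note the hardcoded
    # ../../../ prefix: it assumes the stylesheet is saved exactly three
    # directories below the output root (e.g. assets/<a>/<b>/style.css);
    # stylesheets saved at a different depth would need a computed relative
    # prefix instead.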
    def process_css_content(self, css_content, base_url):
        url_pattern = r'url\(\s*["\']?([^"\')\s]+?)["\']?\s*\)'
        import_pattern = r'@import\s+["\']([^"\']+)["\']'

        def replace_css_url(match):
            original_url = match.group(1).strip()
            if original_url.startswith(('data:', 'http://', 'https://')):
                return match.group(0)
            local_path = self.download_asset(original_url, base_url)
            if local_path and local_path != original_url:
                if local_path.startswith('assets/'):
                    css_relative_path = f"../../../{local_path}"
                else:
                    css_relative_path = f"../../../assets/{local_path}"
                is_font = any(original_url.endswith(ext) for ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot'])
                if is_font:
                    self.log(f"Font URL rewritten: {original_url} -> {css_relative_path}")
                else:
                    self.log(f"CSS URL rewritten: {original_url} -> {css_relative_path}")
                return f'url("{css_relative_path}")'
            return match.group(0)

        def replace_import_url(match):
            original_url = match.group(1).strip()
            if original_url.startswith(('http://', 'https://')):
                return match.group(0)
            local_path = self.download_asset(original_url, base_url)
            if local_path and local_path != original_url:
                if local_path.startswith('assets/'):
                    css_relative_path = f"../../../{local_path}"
                else:
                    css_relative_path = f"../../../assets/{local_path}"
                self.log(f"CSS import rewritten: {original_url} -> {css_relative_path}")
                return f'@import "{css_relative_path}"'
            return match.group(0)

        css_content = re.sub(url_pattern, replace_css_url, css_content)
        css_content = re.sub(import_pattern, replace_import_url, css_content)
        return css_content
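
    # The prettify_* helpers are best-effort: jsbeautifier and cssbeautifier
    # reformat minified sources for readability, and any failure falls back
    # to the original content unchanged.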
    def prettify_js(self, js_content):
        try:
            options = jsbeautifier.default_options()
            options.indent_size = 2
            options.space_in_empty_paren = True
            return jsbeautifier.beautify(js_content, options)
        except Exception as e:
            self.log(f"Could not prettify JavaScript: {str(e)}")
            return js_content

    def prettify_css(self, css_content):
        try:
            options = cssbeautifier.default_options()
            options.indent_size = 2
            options.selector_separator_newline = True
            return cssbeautifier.beautify(css_content, options)
        except Exception as e:
            self.log(f"Could not prettify CSS: {str(e)}")
            return css_content

    def prettify_html(self, html_content):
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.prettify()
        except Exception as e:
            self.log(f"Could not prettify HTML: {str(e)}")
            return html_content
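
    # Scroll-reveal animations typically render elements with opacity:0 and a
    # translateY()/translateZ() offset, then un-hide them from JavaScript.
    # With JS removed that content would stay invisible, so "clean" mode
    # strips those inline declarations to produce a fully visible static page.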
    def clean_visibility_styles(self, soup):
        try:
            for element in soup.find_all(attrs={"offscreen": True}):
                del element.attrs["offscreen"]
                self.log(f"Removed offscreen attribute from {element.name} element")
            for element in soup.find_all(style=True):
                style = element.get('style', '')
                if 'opacity:0' in style or 'translateY(' in style:
                    style_parts = [part.strip() for part in style.split(';') if part.strip()]
                    cleaned_parts = []
                    for part in style_parts:
                        if not any(hide_pattern in part for hide_pattern in ['opacity:0', 'translateY(', 'translateZ(']):
                            cleaned_parts.append(part)
                    if cleaned_parts:
                        element['style'] = '; '.join(cleaned_parts)
                    else:
                        del element.attrs['style']
                    self.log(f"Cleaned visibility styles from {element.name} element")
            return soup
        except Exception as e:
            self.log(f"Could not clean visibility styles: {str(e)}")
            return soup
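
    # Inline <script> bodies are moved to assets/js/inline/inline-<n>.js and
    # the original tag is repointed via its src attribute, so document order
    # (and thus execution order) is preserved; scripts that rely on
    # document.write may still behave differently once external.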
    def extract_inline_js(self, soup):
        js_dir = self.output_dir / "assets" / "js" / "inline"
        js_dir.mkdir(parents=True, exist_ok=True)
        for script in soup.find_all('script'):
            if script.string and script.string.strip():
                self.inline_js_counter += 1
                js_filename = f"inline-{self.inline_js_counter}.js"
                js_path = js_dir / js_filename
                js_content = script.string
                if self.js_mode == "prettify":
                    js_content = self.prettify_js(js_content)
                with open(js_path, 'w', encoding='utf-8') as f:
                    f.write(js_content)
                script.string = ""
                script['src'] = f"assets/js/inline/{js_filename}"
                self.log(f"Extracted inline JS to: assets/js/inline/{js_filename}")
    def extract_inline_css(self, soup):
        css_dir = self.output_dir / "assets" / "css" / "inline"
        css_dir.mkdir(parents=True, exist_ok=True)
        for style in soup.find_all('style'):
            if style.string and style.string.strip():
                self.inline_css_counter += 1
                css_filename = f"inline-{self.inline_css_counter}.css"
                css_path = css_dir / css_filename
                css_content = style.string
                if self.js_mode in ["prettify", "clean"]:
                    css_content = self.prettify_css(css_content)
                with open(css_path, 'w', encoding='utf-8') as f:
                    f.write(css_content)
                new_link = soup.new_tag('link')
                new_link['rel'] = 'stylesheet'
                new_link['href'] = f"assets/css/inline/{css_filename}"
                style.replace_with(new_link)
                self.log(f"Extracted inline CSS to: assets/css/inline/{css_filename}")
    def process_javascript_assets(self, soup, url):
        if self.js_mode == "disable":
            self.log("Disabling all JavaScript...")
            for script in soup.find_all('script'):
                script.decompose()
            return
        if self.js_mode in ["extract", "prettify", "clean"]:
            self.extract_inline_js(soup)
        for script in soup.find_all('script', src=True):
            if script.get('src'):
                original_src = script['src']
                if self.js_mode in ["prettify", "clean"]:
                    try:
                        js_url = urljoin(url, original_src)
                        js_response = self.session.get(js_url, timeout=30)
                        js_response.raise_for_status()
                        js_content = self.prettify_js(js_response.text)
                        local_path = self.download_asset(original_src, url)
                        js_path = self.output_dir / local_path
                        with open(js_path, 'w', encoding='utf-8') as f:
                            f.write(js_content)
                        script['src'] = local_path
                        self.log(f"Processed JS file: {local_path}")
                    except Exception as e:
                        self.log(f"Could not process JS file {original_src}: {str(e)}")
                        local_path = self.download_asset(original_src, url)
                        script['src'] = local_path
                else:
                    local_path = self.download_asset(original_src, url)
                    script['src'] = local_path
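
    # copy_page is the top-level pipeline: fetch the HTML, then rewrite
    # stylesheets, scripts, images (src and srcset), inline style url(...)
    # references, favicons, and preload links to local copies, and finally
    # write the (optionally prettified) HTML next to the assets/ tree.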
    def copy_page(self, url, output_filename=None):
        try:
            self.log(f"Starting to copy page: {url}")
            self.log(f"JavaScript mode: {self.js_mode}")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            if not output_filename:
                parsed_url = urlparse(url)
                path = parsed_url.path.strip('/')
                if not path or path.endswith('/'):
                    output_filename = "index.html"
                else:
                    output_filename = path.split('/')[-1]
                    if not output_filename.endswith('.html'):
                        output_filename += '.html'
            output_path = self.output_dir / output_filename
            self.log("Processing HTML content...")
            if self.js_mode in ["extract", "prettify", "clean"]:
                self.extract_inline_css(soup)
            for link in soup.find_all('link', rel='stylesheet'):
                if link.get('href'):
                    original_href = link['href']
                    css_url = urljoin(url, original_href)
                    try:
                        self.log(f"Processing CSS file: {css_url}")
                        css_response = self.session.get(css_url, timeout=30)
                        css_response.raise_for_status()
                        if '@font-face' in css_response.text:
                            self.log(f"CSS contains @font-face declarations: {css_url}")
                        processed_css = self.process_css_content(css_response.text, css_url)
                        if self.js_mode in ["prettify", "clean"]:
                            processed_css = self.prettify_css(processed_css)
                            self.log(f"CSS file prettified: {original_href}")
                        local_path = self.download_asset(original_href, url)
                        css_path = self.output_dir / local_path
                        with open(css_path, 'w', encoding='utf-8') as f:
                            f.write(processed_css)
                        link['href'] = local_path
                        self.log(f"CSS file processed and saved: {local_path}")
                    except Exception as e:
                        self.log(f"Could not process CSS file {original_href}: {str(e)}")
                        local_path = self.download_asset(original_href, url)
                        link['href'] = local_path
            self.process_javascript_assets(soup, url)
            for img in soup.find_all('img', src=True):
                if img.get('src'):
                    local_path = self.download_asset(img['src'], url)
                    img['src'] = local_path
            for img in soup.find_all('img', srcset=True):
                srcset = img['srcset']
                srcset_parts = []
                for part in srcset.split(','):
                    part = part.strip()
                    if ' ' in part:
                        src, descriptor = part.rsplit(' ', 1)
                        local_path = self.download_asset(src, url)
                        srcset_parts.append(f"{local_path} {descriptor}")
                    else:
                        local_path = self.download_asset(part, url)
                        srcset_parts.append(local_path)
                img['srcset'] = ', '.join(srcset_parts)
            for element in soup.find_all(style=True):
                style = element['style']
                url_pattern = r'url\(["\']?([^"\')\s]+)["\']?\)'

                def replace_style_url(match):
                    original_url = match.group(1)
                    if original_url.startswith(('data:', 'http://', 'https://')):
                        return match.group(0)
                    local_path = self.download_asset(original_url, url)
                    return f'url("{local_path}")'

                element['style'] = re.sub(url_pattern, replace_style_url, style)
            for style_tag in soup.find_all('style'):
                if style_tag.string:
                    processed_css = self.process_css_content(style_tag.string, url)
                    style_tag.string = processed_css
            for link in soup.find_all('link', href=True):
                rel = link.get('rel', [])
                if isinstance(rel, str):
                    rel = [rel]
                if any(r in ['icon', 'shortcut icon', 'apple-touch-icon'] for r in rel):
                    local_path = self.download_asset(link['href'], url)
                    link['href'] = local_path
                elif 'preload' in rel:
                    original_href = link['href']
                    as_attr = link.get('as', '')
                    local_path = self.download_asset(original_href, url)
                    link['href'] = local_path
                    if as_attr == 'font' or any(original_href.endswith(ext) for ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot']):
                        self.log(f"Font preload rewritten: {original_href} -> {local_path}")
                    else:
                        self.log(f"Preload asset rewritten: {original_href} -> {local_path}")
            meta_tag = soup.new_tag('meta')
            meta_tag.attrs['name'] = 'generator'
            meta_tag.attrs['content'] = f'Single Page Copier CLI Tool (JS Mode: {self.js_mode})'
            if soup.head:
                soup.head.append(meta_tag)
            if self.js_mode == "clean":
                soup = self.clean_visibility_styles(soup)
                self.log("Visibility styles cleaned for static display")
            html_output = str(soup)
            if self.js_mode in ["prettify", "clean"]:
                html_output = self.prettify_html(html_output)
                self.log("HTML output prettified")
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(html_output)
            self.success(f"Page successfully copied to: {output_path}")
            self.success(f"Downloaded {len(self.assets_downloaded)} assets")
            return {
                'success': True,
                'output_file': str(output_path),
                'assets_count': len(self.assets_downloaded),
                'url': url
            }
        except Exception as e:
            self.error(f"Failed to copy page {url}: {str(e)}")
            return {
                'success': False,
                'error': str(e),
                'url': url
            }
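
# Minimal programmatic usage (a sketch; the CLI in main() is the primary
# interface). "https://example.com/" is a placeholder URL:
#
#   copier = SinglePageCopier(output_dir="output", verbose=True, js_mode="clean")
#   result = copier.copy_page("https://example.com/")
#   if result['success']:
#       print(result['output_file'], result['assets_count'])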
def main():
    parser = argparse.ArgumentParser(
        description='High-Quality Single Page Website Copier with Developer-Friendly Options',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python single_page_copier.py https://www.wealthfront.com/
  python single_page_copier.py https://www.wealthfront.com/cash --output cash.html
  python single_page_copier.py https://www.wealthfront.com/stock-investing -o stock.html -v
  python single_page_copier.py https://www.wealthfront.com/bonds --dir my_output --verbose

JavaScript Handling Examples:
  python single_page_copier.py https://www.wealthfront.com/ --js-mode disable
  python single_page_copier.py https://www.wealthfront.com/ --js-mode extract --verbose
  python single_page_copier.py https://www.wealthfront.com/ --js-mode prettify
  python single_page_copier.py https://www.wealthfront.com/ --js-mode clean
"""
    )
    parser.add_argument('url', help='URL of the page to copy')
    parser.add_argument('-o', '--output', help='Output filename (default: auto-generated)')
    parser.add_argument('-d', '--dir', default='output', help='Output directory (default: output)')
    parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose logging')
    parser.add_argument('--js-mode', choices=['keep', 'disable', 'extract', 'prettify', 'clean'],
                        default='keep', help='''JavaScript handling mode:
  keep     - Download JS files as-is (default)
  disable  - Remove all JavaScript for static-only output
  extract  - Extract inline JS/CSS to separate files
  prettify - Beautify minified JavaScript for readability
  clean    - Extract + organize assets for developer-friendly structure''')
    args = parser.parse_args()
    if not args.url.startswith(('http://', 'https://')):
        print("[ERROR] URL must start with http:// or https://")
        sys.exit(1)
    copier = SinglePageCopier(output_dir=args.dir, verbose=args.verbose, js_mode=args.js_mode)
    print("Single Page Copier CLI Tool")
    print(f"URL: {args.url}")
    print(f"Output Directory: {args.dir}")
    print(f"JavaScript Mode: {args.js_mode}")
    print("-" * 50)
    result = copier.copy_page(args.url, args.output)
    if result['success']:
        print("-" * 50)
        print("✅ Copy completed successfully!")
        print(f"📄 Output file: {result['output_file']}")
        print(f"📦 Assets downloaded: {result['assets_count']}")
        print(f"🌐 Original URL: {result['url']}")
        print("\nTo view the copied page:")
        print(f"  cd {args.dir}")
        print("  python3 -m http.server 8000")
        print(f"  Open: http://localhost:8000/{os.path.basename(result['output_file'])}")
    else:
        print("-" * 50)
        print("❌ Copy failed!")
        print(f"Error: {result['error']}")
        sys.exit(1)


if __name__ == '__main__':
    main()