import os
import sys
import argparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, unquote
import json
import time
import re
from pathlib import Path
import jsbeautifier
import cssbeautifier
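
# Single-page website copier: downloads one HTML page plus its CSS, JS, image,
# and font assets into a local directory, rewriting references so the copy can
# be served offline. SinglePageCopier is the programmatic entry point; the CLI
# lives in main() below.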
class SinglePageCopier:
    def __init__(self, output_dir="output", verbose=False, js_mode="keep"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.verbose = verbose
        self.js_mode = js_mode
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.assets_downloaded = {}
        self.inline_js_counter = 0
        self.inline_css_counter = 0
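
    # js_mode is one of: "keep" (download JS as-is, default), "disable" (strip
    # all <script> tags), "extract" (move inline JS/CSS into separate files),
    # "prettify" (extract + beautify minified sources), or "clean" (extract,
    # beautify, and strip animation/visibility styles for a static snapshot).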
    def log(self, message):
        if self.verbose:
            print(f"[INFO] {message}")

    def error(self, message):
        print(f"[ERROR] {message}")

    def success(self, message):
        print(f"[SUCCESS] {message}")
    def download_asset(self, asset_url, base_url):
        try:
            full_url = urljoin(base_url, asset_url)
            # Cache hit: each unique URL is downloaded only once per run.
            if full_url in self.assets_downloaded:
                return self.assets_downloaded[full_url]
            self.log(f"Downloading asset: {full_url}")
            response = self.session.get(full_url, timeout=30)
            response.raise_for_status()
            parsed_url = urlparse(full_url)
            path_parts = parsed_url.path.strip('/').split('/')
            is_font = any(full_url.endswith(ext) for ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot'])
            if len(path_parts) > 1:
                asset_dir = self.output_dir / "assets" / "/".join(path_parts[:-1])
                asset_dir.mkdir(parents=True, exist_ok=True)
                local_path = asset_dir / path_parts[-1]
            else:
                asset_dir = self.output_dir / "assets"
                asset_dir.mkdir(parents=True, exist_ok=True)
                local_path = asset_dir / (path_parts[0] if path_parts[0] else "asset")
            if parsed_url.query:
                filename = local_path.name
                if '.' not in filename:
                    content_type = response.headers.get('content-type', '')
                    if 'css' in content_type:
                        filename += '.css'
                    elif 'javascript' in content_type:
                        filename += '.js'
                    elif 'image' in content_type:
                        if 'png' in content_type:
                            filename += '.png'
                        elif 'jpeg' in content_type or 'jpg' in content_type:
                            filename += '.jpg'
                        elif 'gif' in content_type:
                            filename += '.gif'
                        elif 'svg' in content_type:
                            filename += '.svg'
                    elif 'font' in content_type or is_font:
                        if 'woff2' in content_type:
                            filename += '.woff2'
                        elif 'woff' in content_type:
                            filename += '.woff'
                        elif 'truetype' in content_type:
                            filename += '.ttf'
                        elif 'opentype' in content_type:
                            filename += '.otf'
                local_path = local_path.parent / filename
            with open(local_path, 'wb') as f:
                f.write(response.content)
            relative_path = os.path.relpath(local_path, self.output_dir)
            self.assets_downloaded[full_url] = relative_path
            if is_font:
                self.log(f"Font asset saved: {relative_path}")
            else:
                self.log(f"Asset saved: {relative_path}")
            return relative_path
        except Exception as e:
            self.error(f"Failed to download asset {asset_url}: {str(e)}")
            return asset_url
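
    # process_css_content rewrites url(...) and @import references inside a
    # stylesheet so they point at downloaded copies. Note the hardcoded
    # ../../../ prefix: it assumes the stylesheet is saved exactly three
    # directories below the output root (e.g. assets/<a>/<b>/style.css);
    # stylesheets saved at a different depth would need a computed relative
    # prefix instead.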
    def process_css_content(self, css_content, base_url):
        url_pattern = r'url\(\s*["\']?([^"\')\s]+?)["\']?\s*\)'
        import_pattern = r'@import\s+["\']([^"\']+)["\']'

        def replace_css_url(match):
            original_url = match.group(1).strip()
            if original_url.startswith(('data:', 'http://', 'https://')):
                return match.group(0)
            local_path = self.download_asset(original_url, base_url)
            if local_path and local_path != original_url:
                if local_path.startswith('assets/'):
                    css_relative_path = f"../../../{local_path}"
                else:
                    css_relative_path = f"../../../assets/{local_path}"
                is_font = any(original_url.endswith(ext) for ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot'])
                if is_font:
                    self.log(f"Font URL rewritten: {original_url} -> {css_relative_path}")
                else:
                    self.log(f"CSS URL rewritten: {original_url} -> {css_relative_path}")
                return f'url("{css_relative_path}")'
            return match.group(0)

        def replace_import_url(match):
            original_url = match.group(1).strip()
            if original_url.startswith(('http://', 'https://')):
                return match.group(0)
            local_path = self.download_asset(original_url, base_url)
            if local_path and local_path != original_url:
                if local_path.startswith('assets/'):
                    css_relative_path = f"../../../{local_path}"
                else:
                    css_relative_path = f"../../../assets/{local_path}"
                self.log(f"CSS import rewritten: {original_url} -> {css_relative_path}")
                return f'@import "{css_relative_path}"'
            return match.group(0)

        css_content = re.sub(url_pattern, replace_css_url, css_content)
        css_content = re.sub(import_pattern, replace_import_url, css_content)
        return css_content
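
    # The prettify_* helpers are best-effort: jsbeautifier and cssbeautifier
    # reformat minified sources for readability, and any failure falls back
    # to the original content unchanged.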
    def prettify_js(self, js_content):
        try:
            options = jsbeautifier.default_options()
            options.indent_size = 2
            options.space_in_empty_paren = True
            return jsbeautifier.beautify(js_content, options)
        except Exception as e:
            self.log(f"Could not prettify JavaScript: {str(e)}")
            return js_content

    def prettify_css(self, css_content):
        try:
            options = cssbeautifier.default_options()
            options.indent_size = 2
            options.selector_separator_newline = True
            return cssbeautifier.beautify(css_content, options)
        except Exception as e:
            self.log(f"Could not prettify CSS: {str(e)}")
            return css_content

    def prettify_html(self, html_content):
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.prettify()
        except Exception as e:
            self.log(f"Could not prettify HTML: {str(e)}")
            return html_content
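
    # Scroll-reveal animations typically render elements with opacity:0 and a
    # translateY()/translateZ() offset, then un-hide them from JavaScript.
    # With JS removed that content would stay invisible, so "clean" mode
    # strips those inline declarations to produce a fully visible static page.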
    def clean_visibility_styles(self, soup):
        try:
            for element in soup.find_all(attrs={"offscreen": True}):
                del element.attrs["offscreen"]
                self.log(f"Removed offscreen attribute from {element.name} element")
            for element in soup.find_all(style=True):
                style = element.get('style', '')
                if 'opacity:0' in style or 'translateY(' in style:
                    style_parts = [part.strip() for part in style.split(';') if part.strip()]
                    cleaned_parts = []
                    for part in style_parts:
                        if not any(hide_pattern in part for hide_pattern in ['opacity:0', 'translateY(', 'translateZ(']):
                            cleaned_parts.append(part)
                    if cleaned_parts:
                        element['style'] = '; '.join(cleaned_parts)
                    else:
                        del element.attrs['style']
                    self.log(f"Cleaned visibility styles from {element.name} element")
            return soup
        except Exception as e:
            self.log(f"Could not clean visibility styles: {str(e)}")
            return soup
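
    # Inline <script> bodies are moved to assets/js/inline/inline-<n>.js and
    # the original tag is repointed via its src attribute, so document order
    # (and thus execution order) is preserved; scripts that rely on
    # document.write may still behave differently once external.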
    def extract_inline_js(self, soup):
        js_dir = self.output_dir / "assets" / "js" / "inline"
        js_dir.mkdir(parents=True, exist_ok=True)
        for script in soup.find_all('script'):
            if script.string and script.string.strip():
                self.inline_js_counter += 1
                js_filename = f"inline-{self.inline_js_counter}.js"
                js_path = js_dir / js_filename
                js_content = script.string
                if self.js_mode == "prettify":
                    js_content = self.prettify_js(js_content)
                with open(js_path, 'w', encoding='utf-8') as f:
                    f.write(js_content)
                script.string = ""
                script['src'] = f"assets/js/inline/{js_filename}"
                self.log(f"Extracted inline JS to: assets/js/inline/{js_filename}")
    def extract_inline_css(self, soup):
        css_dir = self.output_dir / "assets" / "css" / "inline"
        css_dir.mkdir(parents=True, exist_ok=True)
        for style in soup.find_all('style'):
            if style.string and style.string.strip():
                self.inline_css_counter += 1
                css_filename = f"inline-{self.inline_css_counter}.css"
                css_path = css_dir / css_filename
                css_content = style.string
                if self.js_mode in ["prettify", "clean"]:
                    css_content = self.prettify_css(css_content)
                with open(css_path, 'w', encoding='utf-8') as f:
                    f.write(css_content)
                new_link = soup.new_tag('link')
                new_link['rel'] = 'stylesheet'
                new_link['href'] = f"assets/css/inline/{css_filename}"
                style.replace_with(new_link)
                self.log(f"Extracted inline CSS to: assets/css/inline/{css_filename}")
    def process_javascript_assets(self, soup, url):
        if self.js_mode == "disable":
            self.log("Disabling all JavaScript...")
            for script in soup.find_all('script'):
                script.decompose()
            return
        if self.js_mode in ["extract", "prettify", "clean"]:
            self.extract_inline_js(soup)
        for script in soup.find_all('script', src=True):
            if script.get('src'):
                original_src = script['src']
                if self.js_mode in ["prettify", "clean"]:
                    try:
                        js_url = urljoin(url, original_src)
                        js_response = self.session.get(js_url, timeout=30)
                        js_response.raise_for_status()
                        js_content = self.prettify_js(js_response.text)
                        local_path = self.download_asset(original_src, url)
                        js_path = self.output_dir / local_path
                        with open(js_path, 'w', encoding='utf-8') as f:
                            f.write(js_content)
                        script['src'] = local_path
                        self.log(f"Processed JS file: {local_path}")
                    except Exception as e:
                        self.log(f"Could not process JS file {original_src}: {str(e)}")
                        local_path = self.download_asset(original_src, url)
                        script['src'] = local_path
                else:
                    local_path = self.download_asset(original_src, url)
                    script['src'] = local_path
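
    # copy_page is the top-level pipeline: fetch the HTML, then rewrite
    # stylesheets, scripts, images (src and srcset), inline style url(...)
    # references, favicons, and preload links to local copies, and finally
    # write the (optionally prettified) HTML next to the assets/ tree.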
    def copy_page(self, url, output_filename=None):
        try:
            self.log(f"Starting to copy page: {url}")
            self.log(f"JavaScript mode: {self.js_mode}")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            if not output_filename:
                parsed_url = urlparse(url)
                path = parsed_url.path.strip('/')
                if not path or path.endswith('/'):
                    output_filename = "index.html"
                else:
                    output_filename = path.split('/')[-1]
                    if not output_filename.endswith('.html'):
                        output_filename += '.html'
            output_path = self.output_dir / output_filename
            self.log("Processing HTML content...")
            if self.js_mode in ["extract", "prettify", "clean"]:
                self.extract_inline_css(soup)
            for link in soup.find_all('link', rel='stylesheet'):
                if link.get('href'):
                    original_href = link['href']
                    css_url = urljoin(url, original_href)
                    try:
                        self.log(f"Processing CSS file: {css_url}")
                        css_response = self.session.get(css_url, timeout=30)
                        css_response.raise_for_status()
                        if '@font-face' in css_response.text:
                            self.log(f"CSS contains @font-face declarations: {css_url}")
                        processed_css = self.process_css_content(css_response.text, css_url)
                        if self.js_mode in ["prettify", "clean"]:
                            processed_css = self.prettify_css(processed_css)
                            self.log(f"CSS file prettified: {original_href}")
                        local_path = self.download_asset(original_href, url)
                        css_path = self.output_dir / local_path
                        with open(css_path, 'w', encoding='utf-8') as f:
                            f.write(processed_css)
                        link['href'] = local_path
                        self.log(f"CSS file processed and saved: {local_path}")
                    except Exception as e:
                        self.log(f"Could not process CSS file {original_href}: {str(e)}")
                        local_path = self.download_asset(original_href, url)
                        link['href'] = local_path
            self.process_javascript_assets(soup, url)
            for img in soup.find_all('img', src=True):
                if img.get('src'):
                    local_path = self.download_asset(img['src'], url)
                    img['src'] = local_path
            for img in soup.find_all('img', srcset=True):
                srcset = img['srcset']
                srcset_parts = []
                for part in srcset.split(','):
                    part = part.strip()
                    if ' ' in part:
                        src, descriptor = part.rsplit(' ', 1)
                        local_path = self.download_asset(src, url)
                        srcset_parts.append(f"{local_path} {descriptor}")
                    else:
                        local_path = self.download_asset(part, url)
                        srcset_parts.append(local_path)
                img['srcset'] = ', '.join(srcset_parts)
            for element in soup.find_all(style=True):
                style = element['style']
                url_pattern = r'url\(["\']?([^"\')\s]+)["\']?\)'

                def replace_style_url(match):
                    original_url = match.group(1)
                    if original_url.startswith(('data:', 'http://', 'https://')):
                        return match.group(0)
                    local_path = self.download_asset(original_url, url)
                    return f'url("{local_path}")'

                element['style'] = re.sub(url_pattern, replace_style_url, style)
            for style_tag in soup.find_all('style'):
                if style_tag.string:
                    processed_css = self.process_css_content(style_tag.string, url)
                    style_tag.string = processed_css
            for link in soup.find_all('link', href=True):
                rel = link.get('rel', [])
                if isinstance(rel, str):
                    rel = [rel]
                if any(r in ['icon', 'shortcut icon', 'apple-touch-icon'] for r in rel):
                    local_path = self.download_asset(link['href'], url)
                    link['href'] = local_path
                elif 'preload' in rel:
                    original_href = link['href']
                    as_attr = link.get('as', '')
                    local_path = self.download_asset(original_href, url)
                    link['href'] = local_path
                    if as_attr == 'font' or any(original_href.endswith(ext) for ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot']):
                        self.log(f"Font preload rewritten: {original_href} -> {local_path}")
                    else:
                        self.log(f"Preload asset rewritten: {original_href} -> {local_path}")
            meta_tag = soup.new_tag('meta')
            meta_tag.attrs['name'] = 'generator'
            meta_tag.attrs['content'] = f'Single Page Copier CLI Tool (JS Mode: {self.js_mode})'
            if soup.head:
                soup.head.append(meta_tag)
            if self.js_mode == "clean":
                soup = self.clean_visibility_styles(soup)
                self.log("Visibility styles cleaned for static display")
            html_output = str(soup)
            if self.js_mode in ["prettify", "clean"]:
                html_output = self.prettify_html(html_output)
                self.log("HTML output prettified")
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(html_output)
            self.success(f"Page successfully copied to: {output_path}")
            self.success(f"Downloaded {len(self.assets_downloaded)} assets")
            return {
                'success': True,
                'output_file': str(output_path),
                'assets_count': len(self.assets_downloaded),
                'url': url
            }
        except Exception as e:
            self.error(f"Failed to copy page {url}: {str(e)}")
            return {
                'success': False,
                'error': str(e),
                'url': url
            }
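
# Minimal programmatic usage (a sketch; the CLI in main() is the primary
# interface). "https://example.com/" is a placeholder URL:
#
#   copier = SinglePageCopier(output_dir="output", verbose=True, js_mode="clean")
#   result = copier.copy_page("https://example.com/")
#   if result['success']:
#       print(result['output_file'], result['assets_count'])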
def main():
    parser = argparse.ArgumentParser(
        description='High-Quality Single Page Website Copier with Developer-Friendly Options',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python single_page_copier.py https://www.wealthfront.com/
  python single_page_copier.py https://www.wealthfront.com/cash --output cash.html
  python single_page_copier.py https://www.wealthfront.com/stock-investing -o stock.html -v
  python single_page_copier.py https://www.wealthfront.com/bonds --dir my_output --verbose

JavaScript Handling Examples:
  python single_page_copier.py https://www.wealthfront.com/ --js-mode disable
  python single_page_copier.py https://www.wealthfront.com/ --js-mode extract --verbose
  python single_page_copier.py https://www.wealthfront.com/ --js-mode prettify
  python single_page_copier.py https://www.wealthfront.com/ --js-mode clean
"""
    )
    parser.add_argument('url', help='URL of the page to copy')
    parser.add_argument('-o', '--output', help='Output filename (default: auto-generated)')
    parser.add_argument('-d', '--dir', default='output', help='Output directory (default: output)')
    parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose logging')
    parser.add_argument('--js-mode', choices=['keep', 'disable', 'extract', 'prettify', 'clean'],
                        default='keep', help='''JavaScript handling mode:
  keep     - Download JS files as-is (default)
  disable  - Remove all JavaScript for static-only output
  extract  - Extract inline JS/CSS to separate files
  prettify - Beautify minified JavaScript for readability
  clean    - Extract + organize assets for developer-friendly structure''')
    args = parser.parse_args()
    if not args.url.startswith(('http://', 'https://')):
        print("[ERROR] URL must start with http:// or https://")
        sys.exit(1)
    copier = SinglePageCopier(output_dir=args.dir, verbose=args.verbose, js_mode=args.js_mode)
    print("Single Page Copier CLI Tool")
    print(f"URL: {args.url}")
    print(f"Output Directory: {args.dir}")
    print(f"JavaScript Mode: {args.js_mode}")
    print("-" * 50)
    result = copier.copy_page(args.url, args.output)
    if result['success']:
        print("-" * 50)
        print("✅ Copy completed successfully!")
        print(f"📄 Output file: {result['output_file']}")
        print(f"📦 Assets downloaded: {result['assets_count']}")
        print(f"🌐 Original URL: {result['url']}")
        print("\nTo view the copied page:")
        print(f"  cd {args.dir}")
        print("  python3 -m http.server 8000")
        print(f"  Open: http://localhost:8000/{os.path.basename(result['output_file'])}")
    else:
        print("-" * 50)
        print("❌ Copy failed!")
        print(f"Error: {result['error']}")
        sys.exit(1)


if __name__ == '__main__':
    main()