#!/usr/bin/env python3

# imports: standard library plus requests/BeautifulSoup/pdfplumber/pypdf and mysql.connector
import re
import os
import json
import requests
import pdfplumber
import mimetypes
import hashlib
import html
import mysql.connector
from bs4 import BeautifulSoup
from datetime import date, timedelta, datetime
from dateutil.parser import parse
import sys, getopt
from pathlib import Path
from pypdf import PdfReader
from pypdf.errors import PdfReadError
# sys.tracebacklimit=0
from requests import Request, Session

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import warnings
warnings.filterwarnings('ignore', message='Unverified HTTPS request')

# how many days back we search (0 means today only); read from the first command-line argument
if len(sys.argv) > 1:
	MAX_DAYS = int(sys.argv[1]);
	print('MAX_DAYS = '+str(MAX_DAYS));
else:
	MAX_DAYS = 0;
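# example invocation (the scraper filename here is just illustrative):
#   ./some_scraper.py 7   -> include articles from the past 7 days
#   ./some_scraper.py     -> include today's articles only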


BASE_DIR = '/home/gtf/REGGIE/reggie/scrapers';
jdate	 = str(date.today())

# List of keywords; an article is kept only if at least one of them appears in its title or body.
KEYWORDS = [
    "Artificial Intelligence", "AI", "Autonomous Vehicles", "Border Gateway Protocol", "BGP",
    "Blockchain", "Cloud", "Communications", "Critical Infrastructure", "Cryptocurrency",
    "Cyber Insurance", "Cybersecurity", "Cybersecurity Standards", "Electric Vehicles", "Emergency Preparedness",
    "Encryption", "Industrial Control System", "ICS", "Incident Reporting", "Information Sharing",
    "Internet of Things", "IoT", "Machine Learning", "ML", "Modernization",
    "Positioning Navigation and Timing", "PNT", "Privacy", "Procurement", "Quantum Computing",
    "Ransomware", "Satellite", "Software Bill of Materials", "SBOM", "Secure by Design",
    "Supercomputing", "Supply chain", "Threats", "Unmanned Aircraft Systems", "UAS",
    "Workforce", "Zero Trust", "large language model", "large language models", "LLMs", "LLM",
    "resilience", "outages", "outage", "terrorist", "terrorists", "terrorism", "vulnerability",
    "vulnerabilities", "risk assessment", "risk assessments", "information technology",
    "IT", "information technologies"

]
# let's add the word "the" for testing only...
# KEYWORDS.append("the")

# matching articles are collected in this array and written out later in the script...
array = [];

# get page with standard python request and custom headers (ignoring SSL certs)
def requestPage(url):
	headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'};
	page = requests.get(url, headers=headers, verify=False, timeout=10);
	html = page.text;

	insertPAGES(SITE_ID, MAX_DAYS, url, html, '');
	return html;
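# example call from a scraper (hypothetical URL); note that requestPage, wgetPage,
# savePage and getPDF all log the raw page via insertPAGES, which expects SITE_ID
# to already be defined by the calling scraper:
#   listing_html = requestPage('https://example.gov/newsroom')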

# get page with wget
def wgetPage(url):
	ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36';
	os.system("wget -o /dev/null -O /tmp/wget --user-agent=\""+ua+"\" "+url);
	html = getHTML('/tmp/wget');

	insertPAGES(SITE_ID, MAX_DAYS, url, html, '');
	return html;


# get the html we already downloaded...
def getHTML(html_file):
	with open(html_file) as file:
		html = file.read();
	return html;


# get page by downloading and saving
def savePage(url):
	html = '';
	os.system(BASE_DIR+"/save_page.sh "+url);
	os.system("file /tmp/save_page.txt > /tmp/save_page.ext");
	with open('/tmp/save_page.ext') as f:
		if 'PDF' in f.read():
			# Step 2: Extract text from the PDF
			with pdfplumber.open('/tmp/save_page.txt') as pdf:
				for page in pdf.pages:
					# extract_text() can return None on pages with no text layer
					html += page.extract_text() or '';
		else:
			file = Path('/tmp/save_page.txt');
			if(os.stat('/tmp/save_page.txt').st_size):
				html = file.read_text();

	insertPAGES(SITE_ID, MAX_DAYS, url, html, '');
	return html;


# get a PDF document
def getPDF(url):
	response = requests.get(url, verify=False);

	# Save the PDF to a local file
	with open('/tmp/downloaded.pdf', 'wb') as f:
		f.write(response.content);

	# Step 2: Extract text from the PDF
	with pdfplumber.open('/tmp/downloaded.pdf') as pdf:
		text = ''
		for page in pdf.pages:
			# extract_text() can return None on pages with no text layer
			text += page.extract_text() or '';

	insertPAGES(SITE_ID, MAX_DAYS, url, text, '');
	return(text);
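# example (hypothetical URL):
#   pdf_text = getPDF('https://example.gov/docs/final-rule.pdf')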

# clean strings...
def cleanStr(string):
	clean = str(string);
	clean = re.sub(r"<.*?>"," ",clean);
	clean = re.sub(r'\s+', ' ', clean);
	clean = re.sub(r'\n+', ' ', clean);
	soup  = BeautifulSoup(clean,'html.parser')
	clean = soup.get_text();
	clean = clean.strip();
	return clean;
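# example of what cleanStr() does (tags stripped, whitespace collapsed,
# HTML entities decoded by BeautifulSoup):
#   cleanStr('<p>Secure   by\n Design &amp; Zero Trust</p>')
#   -> 'Secure by Design & Zero Trust'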


def KeywordFound():
	newTitle = cleanStr(title);
	newBody  = cleanStr(body);

	# make sure at least one KEYWORD is in the title or the body of the article...
	# (lowercase everything once so the plain substring test is case-insensitive)
	lowerTitle = newTitle.lower();
	lowerBody  = newBody.lower();
	count = 0
	for key in KEYWORDS:
		key = key.lower();
		if key in lowerBody:  count = 1; print(key);
		if key in lowerTitle: count = 1; print(key);

	# if at least one KEYWORD was found...
	if count == 1:
		x = {	"siteid": SITE_ID,
			"agency": AGENCY,
			"title":  newTitle.encode('ascii', 'ignore').decode(),
			"link":   link,
			"body":   newBody.encode('ascii', 'ignore').decode(),
			"date":   parse(date).strftime('%Y-%m-%d')
			}
		# add "x" data to the array...
		if(x):
			array.append(x)
			ttl = newTitle.encode('ascii', 'ignore').decode();
			bod = newBody.encode('ascii', 'ignore').decode();
			dat = parse(date).strftime('%Y-%m-%d');

			print("array.append = "+dat+" "+link);
			insertJSON(SITE_ID, dat, AGENCY, link, ttl, bod);

	return;
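# KeywordFound() reads module-level variables that each calling scraper is
# expected to set per article: SITE_ID, AGENCY, date, link, title, body.
# Rough call pattern (sketch only -- the parsing step is illustrative):
#   for article in parsed_articles:
#       date, link, title, body = ...   # pulled from the listing page
#       if inDateRange(date):
#           KeywordFound()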

# save the JSON array
def saveJSON(array):
	# if the array is empty do nothing...
	if not array: return;

	# set up the JSON outfile naming convention...
	JSON_FILE = BASE_DIR+"/json/"+SITE_ID+"-"+jdate+".json"

	# print to file as json...
	with open(JSON_FILE, "w") as file: file.write(json.dumps(array,indent=2))

	# print to screen as json...
	# print(json.dumps(array,indent=2));
	return;
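# output lands in BASE_DIR/json/ as <SITE_ID>-<YYYY-MM-DD>.json; presumably
# each scraper calls saveJSON(array) once after all articles are processed.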


# print date/link/title for a quicker view while developing
def printDLT():
	print(date);
	print(link);
	print(title);
	print("=========================");
	return;

# make sure it's in the date range we want...
def inDateRange(date):
	d1 = parse(date).strftime('%Y-%m-%d');
	cutoff = datetime.now() - timedelta(days=MAX_DAYS);
	d2 = cutoff.strftime('%Y-%m-%d');
	if d1 >= d2:
		return d1;
	else:
		return 0;
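# the string comparison above is safe because both dates are normalized to
# ISO 'YYYY-MM-DD', which sorts lexically the same way it sorts chronologically.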

# save pages in mysql
def insertPAGES(sid, days, url, html, json_data):
	# connect to the local mysql database...
	db = mysql.connector.connect(user='root', password='B@dZ@ck',database='zradev',unix_socket='/var/run/mysqld/mysqld.sock');

	cursor = db.cursor();
	# define the query (json_data avoids shadowing the imported json module)
	sql = "INSERT INTO pages (sid, date, days, url, html, json) VALUES (%s, NOW(), %s, %s, %s, %s)";
	values = (sid, days, url, html, json_data);

	# execute and commit
	cursor.execute(sql, values);
	db.commit();
	cursor.close();
	db.close();
	return;
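# the INSERT above assumes a `pages` table shaped roughly like this
# (sketch only -- the authoritative schema lives in the zradev database):
#   CREATE TABLE pages (
#       sid  VARCHAR(64),
#       date DATETIME,
#       days INT,
#       url  TEXT,
#       html LONGTEXT,
#       json LONGTEXT
#   );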


# save the JSON in mysql
def insertJSON(sid, date, agency, link, title, body):
	# connect to the local mysql database...
	db = mysql.connector.connect(user='root', password='B@dZ@ck',database='zradev',unix_socket='/var/run/mysqld/mysqld.sock');

	cursor = db.cursor();
	# define the query
	sql = "INSERT INTO json (sid, date, agency, link, title, body, created) VALUES (%s, %s, %s, %s, %s, %s, NOW())";
	values = (sid, date, agency, link, title, body);

	# execute and commit
	cursor.execute(sql, values);
	db.commit();
	cursor.close();
	db.close();
	return;
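# likewise, the `json` table is assumed to look roughly like this (sketch only):
#   CREATE TABLE json (
#       sid     VARCHAR(64),
#       date    DATE,
#       agency  VARCHAR(255),
#       link    TEXT,
#       title   TEXT,
#       body    LONGTEXT,
#       created DATETIME
#   );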