Jump to content

Recommended Posts

Posted

Readability is a Python library that emulates the "Reading Mode" used by browsers, i.e., it takes an input URL and returns the simplified HTML. It removes headers, footers and scripts.

I made a simple server out of it, which takes CLI arguments for the server IP and server port to start the server. The default IP and port are 127.0.0.1 and 8900.
Example requests that can be made:

http://127.0.0.1:8900?url=https://google.com&output_type=TITLE

http://127.0.0.1:8900?url=https://google.com&output_type=SHORT_TITLE

http://127.0.0.1:8900?url=https://google.com&output_type=CONTENT

http://127.0.0.1:8900?url=https://google.com&output_type=SUMMARY

http://127.0.0.1:8900/health (to check if the server is running)

import http.server
import logging
import re
import sys
import urllib.parse

import requests
from readability import Document

# Configure the root logger once for the whole process: INFO and above,
# each record prefixed with its timestamp and severity level
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

class RequestHandler(http.server.BaseHTTPRequestHandler):
    """Serve readability extractions of remote pages over HTTP GET.

    Routes:
        ``/health``    -- liveness probe, always answers ``200`` / ``OK``.
        anything else  -- expects the query parameters ``url`` and
                          ``output_type`` (in any order), downloads ``url``
                          and answers with the requested part of the
                          readability ``Document`` built from it.

    Every response is plain text: 200 with the extracted output, 400 when a
    parameter is missing or invalid, 500 when fetching or parsing fails.

    Query values are decoded as standard form data, so a target URL that
    itself contains ``&``, ``+``, ``#`` or ``%`` must be percent-encoded by
    the client.
    """

    # Only absolute http(s) URLs are accepted as fetch targets.
    URL_REGEX = re.compile(r"^https?://.+$")

    # Accepted values of the ``output_type`` query parameter.
    ALLOWED_OUTPUT_TYPES = ("TITLE", "SHORT_TITLE", "CONTENT", "SUMMARY")

    # Sent on the outgoing fetch so the target site sees a browser-like client.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:108.0) "
        "Gecko/20100101 Firefox/108.0"
    )

    # Seconds to wait for the target site; without a limit a single stalled
    # upstream would block this single-threaded server indefinitely.
    FETCH_TIMEOUT = 30

    @staticmethod
    def _parse_request_path(path):
        """Split a request target into ``(route, url, output_type)``.

        Args:
            path: The raw request target, e.g. ``"/?url=...&output_type=..."``.

        Returns:
            A 3-tuple of the path before ``?``, the percent-decoded ``url``
            parameter and the ``output_type`` parameter. A parameter that is
            absent or empty is returned as ``None``; when a parameter is
            repeated its first value wins.
        """
        route, _, query = path.partition("?")
        params = urllib.parse.parse_qs(query)
        url = params.get("url", [None])[0]
        output_type = params.get("output_type", [None])[0]
        return route, url, output_type

    def _send_text(self, status, body):
        """Send a complete plain-text response.

        Args:
            status: HTTP status code to answer with.
            body: Response payload as UTF-8 encoded bytes.
        """
        self.send_response(status)
        self.send_header("Content-type", "text/plain; charset=utf-8")
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Handle a GET request: health check or readability extraction."""
        logging.info("Received request: %s", self.path)

        route, url, output_type = self._parse_request_path(self.path)

        if route == "/health":
            # This is a health check request, return a 200 status code
            self._send_text(200, b"OK")
            return

        # Validate the input; a missing parameter is reported the same way as
        # an invalid one instead of crashing the handler.
        if url is None or not self.URL_REGEX.match(url):
            self._send_text(400, b"Invalid URL")
            return
        if output_type not in self.ALLOWED_OUTPUT_TYPES:
            self._send_text(400, b"Invalid output type")
            return

        try:
            # NOTE(security): this fetches whatever URL the caller supplies;
            # do not expose the server beyond trusted clients.
            page = requests.get(
                url,
                headers={"User-Agent": self.USER_AGENT},
                timeout=self.FETCH_TIMEOUT,
            )
            doc = Document(page.content)
            # Bound methods, so only the requested extraction is computed.
            extractors = {
                "TITLE": doc.title,
                "SHORT_TITLE": doc.short_title,
                "CONTENT": doc.content,
                "SUMMARY": doc.summary,
            }
            body = extractors[output_type]().encode()
        except Exception as e:
            # Top-level boundary: log with traceback and tell the client
            # something went wrong without leaking details.
            logging.exception("Error: %s", e)
            self._send_text(500, b"An error occurred while processing the request")
            return

        self._send_text(200, body)

# Get the server IP and port from the command line arguments, falling back
# to 127.0.0.1:8900 when they are omitted. A non-numeric port raises
# ValueError here, before any socket is opened.
server_ip = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1"
server_port = int(sys.argv[2]) if len(sys.argv) > 2 else 8900

# Create the server (this binds the listening socket)
server_address = (server_ip, server_port)
httpd = http.server.HTTPServer(server_address, RequestHandler)

# Log an info message when the server starts
logging.info("Server started")

# Run until interrupted; Ctrl+C is the expected way to stop, so treat it as a
# clean shutdown rather than a crash, and always release the listening socket.
try:
    httpd.serve_forever()
except KeyboardInterrupt:
    logging.info("Server stopped")
finally:
    httpd.server_close()

Note: make sure you have the readability library (https://github.com/buriy/python-readability) installed before using this:

pip install readability-lxml

 

Posted

Example use cases:
 

Wordpress Blog Post
https://lmilosis.wordpress.com/2020/01/26/19/
http://127.0.0.1:8900/?url=https://lmilosis.wordpress.com/2020/01/26/19/&output_type=SUMMARY

News Article
https://us.cnn.com/2023/01/04/weather/severe-storm-tornado-threat-south-wednesday/index.html
http://127.0.0.1:8900/?url=https://us.cnn.com/2023/01/04/weather/severe-storm-tornado-threat-south-wednesday/index.html&output_type=SUMMARY

It doesn't do too well with JS-heavy sites. You may want to edit the script to take HTML source as input instead if you're using another tool to scrape the HTML.

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
×
×
  • Create New...