Search the Community
Showing results for tags 'readability'.
-
Readability is a Python Library that emulates the "Reading Mode" used by Browsers, ie it takes an input URL, and returns the simplified HTML. It removes headers, footers and scripts. I made a simple server out of it, which takes CLI arguments for server IP and server Port to start the server. Default IP and port are 127.0.0.1:8900 Example requests that can be made: http://127.0.0.1:8900?url=https://google.com&output_type=TITLE http://127.0.0.1:8900?url=https://google.com&output_type=SHORT_TITLE http://127.0.0.1:8900?url=https://google.com&output_type=CONTENT http://127.0.0.1:8900?url=https://google.com&output_type=SUMMARY http://127.0.0.1:8900/health (to check if the server is running) import http.server import requests import re import logging import sys from readability import Document # Set up logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") class RequestHandler(http.server.BaseHTTPRequestHandler): def do_GET(self): # Log the request logging.info(f"Received request: {self.path}") # Regular expression to match URLs URL_REGEX = re.compile(r"^https?://.+$") # Allowed output types ALLOWED_OUTPUT_TYPES = ["TITLE", "SHORT_TITLE", "CONTENT", "SUMMARY"] if self.path == "/health": # This is a health check request, return a 200 status code self.send_response(200) self.send_header("Content-type", "text/plain") self.send_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0") self.end_headers() self.wfile.write(b"OK") else: # Parse the query string to get the URL and output type query_string = self.path[2:] query_params = query_string.split("&") url = query_params[0].split("=")[1] output_type = query_params[1].split("=")[1] # Validate the input if not URL_REGEX.match(url): # URL is invalid self.send_response(400) self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(b"Invalid URL") elif output_type not in ALLOWED_OUTPUT_TYPES: # Output type is invalid self.send_response(400) self.send_header("Content-type", "text/plain") self.send_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0") self.end_headers() self.wfile.write(b"Invalid output type") else: # Input is valid, proceed with processing the request try: doc = Document(requests.get(url).content) output = { "TITLE": doc.title(), "SHORT_TITLE": doc.short_title(), "CONTENT": doc.content(), "SUMMARY": doc.summary() }[output_type] # Send the response self.send_response(200) self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(output.encode()) except Exception as e: # Log the error logging.error(f"Error: {e}") # Return an error message to the client self.send_response(500) self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(b"An error occurred while processing the request") # Get the server IP and port from the command line arguments server_ip = sys.argv[1] if len(sys.argv) > 1 else "127.0.0.1" server_port = int(sys.argv[2]) if len(sys.argv) > 2 else 8900 # Create the server and run it indefinitely server_address = (server_ip, server_port) httpd = http.server.HTTPServer(server_address, RequestHandler) # Log an info message when the server starts logging.info("Server started") httpd.serve_forever() Note: make sure you have the readability library https://github.com/buriy/python-readability before using this pip install readability-lxml
- 1 reply
-
- readability
- python
-
(and 1 more)
Tagged with: