Adding URL summary option to DocSum Gradio-UI (#1248)

Signed-off-by: okhleif-IL <omar.khleif@intel.com>
Co-authored-by: okhleif-IL <omar.khleif@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: lkk <33276950+lkk12014402@users.noreply.github.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
Co-authored-by: WenjiaoYue <wenjiao.yue@intel.com>
This commit is contained in:
Mustafa
2024-12-18 18:49:03 -08:00
committed by GitHub
parent 89a7f9e001
commit 84a6a6e9bc
2 changed files with 60 additions and 1 deletions

View File

@@ -6,12 +6,13 @@ import base64
import json
import logging
import os
from urllib.parse import urlparse
import gradio as gr
import requests
import uvicorn
from fastapi import FastAPI
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, UnstructuredURLLoader
# Configure logging
logging.basicConfig(level=logging.INFO)
@@ -91,6 +92,42 @@ class DocSumUI:
base64_str = self.encode_file_to_base64(file)
return self.generate_summary(base64_str, document_type="video")
def is_valid_url(self, url):
    """Report whether ``url`` parses with both a scheme and a network location.

    Args:
        url: The candidate URL to check.

    Returns:
        bool: True when the parsed URL has a non-empty scheme and a non-empty
        netloc; False otherwise, including when parsing raises ValueError.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return False
    return bool(parts.scheme and parts.netloc)
def read_url(self, url):
    """Read and process the content of a url.

    Args:
        url: The url to be read as a document. Leading and trailing
            whitespace is ignored.

    Returns:
        str: The text of the first document loaded from the url. If loading
        fails, the error is logged and an empty string is returned. If the
        url is not valid, an error message is returned instead.
    """
    self.page_content = ""

    # URLs pasted into a text area often carry stray spaces or a trailing newline.
    if isinstance(url, str):
        url = url.strip()
    logger.info(">>> Reading url: %s", url)

    if not self.is_valid_url(url=url):
        msg = f"Invalid URL '{url}'. Make sure the link provided is a valid URL"
        logger.error(msg)
        return msg

    # no_proxy entries are matched against host names, so register the host
    # rather than the whole URL, and only once so that repeated calls do not
    # keep growing the variable.
    host = urlparse(url).hostname
    current = os.environ.get("no_proxy", "")
    if host and host not in (entry.strip() for entry in current.split(",")):
        os.environ["no_proxy"] = f"{current},{host}".strip(",")

    try:
        documents = UnstructuredURLLoader([url]).load()
        if not documents:
            raise ValueError("no content could be extracted from the page")
        self.page_content = documents[0].page_content
    except Exception as e:
        # UI boundary: any network or parsing failure is logged instead of
        # propagating; the caller then receives the empty page content.
        msg = f"There was an error trying to read '{url}' --> '{e}'\nTry adding the domain name to your `no_proxy` variable and try again. Example: example.com*"
        logger.error(msg)

    return self.page_content
def generate_summary(self, doc_content, document_type="text"):
"""Generate a summary for the given document content.
@@ -201,6 +238,25 @@ class DocSumUI:
)
submit_btn.click(fn=self.generate_summary, inputs=[input_text], outputs=[generated_text])
# Standalone Blocks holding the URL summarization widgets: one row with two
# columns, the URL input plus submit button in the first and the summary
# output in the second.
with gr.Blocks() as url_ui:
# URL text UI
with gr.Row():
with gr.Column():
input_text = gr.TextArea(
label="Please paste a URL for summarization",
placeholder="Paste a URL for the information you need to summarize",
)
submit_btn = gr.Button("Generate Summary")
with gr.Column():
generated_text = gr.TextArea(
label="Text Summary", placeholder="Summarized text will be displayed here"
)
# On click, the entered text is passed to read_url and whatever read_url
# returns is handed to generate_summary; the result fills generated_text.
submit_btn.click(
lambda input_text: self.generate_summary(self.read_url(input_text)),
inputs=input_text,
outputs=generated_text,
)
# File Upload UI
file_ui = self.create_upload_ui(
label="Please upload a document (.pdf, .doc, .docx)",
@@ -232,6 +288,8 @@ class DocSumUI:
audio_ui.render()
with gr.TabItem("Upload Video"):
video_ui.render()
with gr.TabItem("Enter URL"):
url_ui.render()
return self.demo

View File

@@ -6,3 +6,4 @@ numpy==1.26.4
opencv-python==4.10.0.82
Pillow==10.3.0
pypdf
unstructured