Adding URL summary option to DocSum Gradio-UI (#1248)
Signed-off-by: okhleif-IL <omar.khleif@intel.com> Co-authored-by: okhleif-IL <omar.khleif@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: lkk <33276950+lkk12014402@users.noreply.github.com> Co-authored-by: chen, suyue <suyue.chen@intel.com> Co-authored-by: WenjiaoYue <wenjiao.yue@intel.com>
This commit is contained in:
@@ -6,12 +6,13 @@ import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import gradio as gr
|
||||
import requests
|
||||
import uvicorn
|
||||
from fastapi import FastAPI
|
||||
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
|
||||
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, UnstructuredURLLoader
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -91,6 +92,42 @@ class DocSumUI:
|
||||
base64_str = self.encode_file_to_base64(file)
|
||||
return self.generate_summary(base64_str, document_type="video")
|
||||
|
||||
def is_valid_url(self, url):
|
||||
try:
|
||||
result = urlparse(url)
|
||||
return all([result.scheme, result.netloc])
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
def read_url(self, url):
|
||||
"""Read and process the content of a url.
|
||||
|
||||
Args:
|
||||
url: The url to be read as a document.
|
||||
|
||||
Returns:
|
||||
str: The content of the website or an error message if the url is unsupported.
|
||||
"""
|
||||
|
||||
self.page_content = ""
|
||||
|
||||
logger.info(">>> Reading url: %s", url)
|
||||
if self.is_valid_url(url=url):
|
||||
os.environ["no_proxy"] = f"{os.environ.get('no_proxy', '')},{url}".strip(",")
|
||||
try:
|
||||
loader = UnstructuredURLLoader([url])
|
||||
page = loader.load()
|
||||
self.page_content = [content.page_content for content in page][0]
|
||||
except Exception as e:
|
||||
msg = f"There was an error trying to read '{url}' --> '{e}'\nTry adding the domain name to your `no_proxy` variable and try again. Example: example.com*"
|
||||
logger.error(msg)
|
||||
else:
|
||||
msg = f"Invalid URL '{url}'. Make sure the link provided is a valid URL"
|
||||
logger.error(msg)
|
||||
return msg
|
||||
|
||||
return self.page_content
|
||||
|
||||
def generate_summary(self, doc_content, document_type="text"):
|
||||
"""Generate a summary for the given document content.
|
||||
|
||||
@@ -201,6 +238,25 @@ class DocSumUI:
|
||||
)
|
||||
submit_btn.click(fn=self.generate_summary, inputs=[input_text], outputs=[generated_text])
|
||||
|
||||
with gr.Blocks() as url_ui:
|
||||
# URL text UI
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
input_text = gr.TextArea(
|
||||
label="Please paste a URL for summarization",
|
||||
placeholder="Paste a URL for the information you need to summarize",
|
||||
)
|
||||
submit_btn = gr.Button("Generate Summary")
|
||||
with gr.Column():
|
||||
generated_text = gr.TextArea(
|
||||
label="Text Summary", placeholder="Summarized text will be displayed here"
|
||||
)
|
||||
submit_btn.click(
|
||||
lambda input_text: self.generate_summary(self.read_url(input_text)),
|
||||
inputs=input_text,
|
||||
outputs=generated_text,
|
||||
)
|
||||
|
||||
# File Upload UI
|
||||
file_ui = self.create_upload_ui(
|
||||
label="Please upload a document (.pdf, .doc, .docx)",
|
||||
@@ -232,6 +288,8 @@ class DocSumUI:
|
||||
audio_ui.render()
|
||||
with gr.TabItem("Upload Video"):
|
||||
video_ui.render()
|
||||
with gr.TabItem("Enter URL"):
|
||||
url_ui.render()
|
||||
|
||||
return self.demo
|
||||
|
||||
|
||||
@@ -6,3 +6,4 @@ numpy==1.26.4
|
||||
opencv-python==4.10.0.82
|
||||
Pillow==10.3.0
|
||||
pypdf
|
||||
unstructured
|
||||
|
||||
Reference in New Issue
Block a user