diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index b300af428..d960977e3 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -41,6 +41,11 @@ from langchain_community.document_loaders import (
from langchain_community.llms import HuggingFaceEndpoint
from PIL import Image
+from comps import CustomLogger
+
+logger = CustomLogger("prepare_doc_util")
+logflag = os.getenv("LOGFLAG", False)
+
class TimeoutError(Exception):
pass
@@ -428,14 +433,51 @@ class Crawler:
if not headers:
headers = self.headers
while max_times:
- if not url.startswith("http") or not url.startswith("https"):
+ parsed_url = urlparse(url)
+ if not parsed_url.scheme:
url = "http://" + url
- print("start fetch %s...", url)
+ if logflag:
+ logger.info("start fetch %s..." % url)
try:
response = requests.get(url, headers=headers, verify=True)
if response.status_code != 200:
print("fail to fetch %s, response status code: %s", url, response.status_code)
else:
+ # Extract charset from the Content-Type header
+ content_type = response.headers.get("Content-Type", "").lower()
+ if "charset=" in content_type:
+ # Extract charset value from the content-type header
+ charset = content_type.split("charset=")[-1].strip()
+ response.encoding = charset
+ if logflag:
+ logger.info(f"Charset detected and set: {response.encoding}")
+ else:
+ import re
+
+ # Extract charset from the response HTML content
+ charset_from_meta = None
+ # Check for <meta charset="...">
+ match = re.search(r'<meta\s+charset=["\']?([^"\'>]+)["\']?', response.text, re.IGNORECASE)
+ if match:
+ charset_from_meta = match.group(1)
+ # Check for <meta http-equiv="Content-Type" content="...; charset=...">
+ if not charset_from_meta:
+ match = re.search(
+ r'<meta\s+http-equiv=["\']?content-type["\']?[^>]*charset=["\']?([^"\'>]+)["\']?',
+ response.text,
+ re.IGNORECASE,
+ )
+ if match:
+ charset_from_meta = match.group(1)
+ if charset_from_meta:
+ response.encoding = charset_from_meta
+ if logflag:
+ logger.info(f"Charset detected and set from meta tag: {response.encoding}")
+ else:
+ # Fallback to default encoding
+ response.encoding = "utf-8"
+ if logflag:
+ logger.info("Charset not specified, using default utf-8")
return response
except Exception as e:
print("fail to fetch %s, caused by %s", url, e)
@@ -540,8 +582,9 @@ def load_html_data(url):
main_content = all_text if main_content == "" else main_content
main_content = main_content.replace("\n", "")
main_content = main_content.replace("\n\n", "")
- main_content = uni_pro(main_content)
main_content = re.sub(r"\s+", " ", main_content)
+ if logflag:
+ logger.info("main_content=[%s]" % main_content)
return main_content