Changes from all commits
46 commits
5a5bc46
suy cat map updated
ugeshdg Mar 4, 2025
60e3825
authentication updated
ugeshdg Mar 8, 2025
b64227b
celery updated
ugeshdg Mar 9, 2025
fa254dd
beat time updated
ugeshdg Mar 10, 2025
0cba38c
beat time updated
ugeshdg Mar 10, 2025
4789744
request sent api updated
ugeshdg Mar 11, 2025
dd4551c
Merge branch 'platform_prod' into platform_dev
ugeshdg Mar 11, 2025
4c90fe5
Merge pull request #839 from digitalgreenorg/platform_dev
ugeshdg Mar 11, 2025
e8fc0ea
scheduler updated to 1hr
ugeshdg Mar 12, 2025
77b7da2
scheduler updated to 1hr
ugeshdg Mar 12, 2025
7627a80
recalled added
ugeshdg Mar 17, 2025
ea7738b
req updated
ugeshdg Mar 17, 2025
eabdb5e
youtube cookies
ugeshdg Mar 18, 2025
0c92184
delay removed
ugeshdg Mar 18, 2025
e7cfd26
youtube api fixed
ugeshdg Mar 18, 2025
013374b
youtube updated
ugeshdg Mar 18, 2025
7865dee
Merge pull request #840 from digitalgreenorg/platform_dev
ugeshdg Mar 19, 2025
d587604
api updated
ugeshdg Mar 25, 2025
d207307
api feature updated
ugeshdg Mar 25, 2025
d9ee231
Merge pull request #841 from digitalgreenorg/platform_dev
ugeshdg Mar 25, 2025
604ed43
api features updated
ugeshdg Mar 26, 2025
2c8f696
Merge pull request #842 from digitalgreenorg/platform_dev
ugeshdg Mar 26, 2025
adb6684
jp library added
ugeshdg Mar 27, 2025
1585c5a
api updated
ugeshdg Mar 27, 2025
b05d1bc
Merge pull request #843 from digitalgreenorg/platform_dev
ugeshdg Mar 27, 2025
8e9e3d0
dropbox api updated
ugeshdg Mar 27, 2025
0a86a51
Merge pull request #844 from digitalgreenorg/platform_dev
ugeshdg Mar 27, 2025
2298bf2
website and api vector creation updated
ugeshdg Apr 1, 2025
95faeca
Merge pull request #845 from digitalgreenorg/platform_dev
ugeshdg Apr 1, 2025
7144fcf
docker file updated
ugeshdg Apr 7, 2025
130f95a
docker file updated
ugeshdg Apr 22, 2025
eb27c5a
removed unnecessary files
ugeshdg May 19, 2025
c62f374
Merge pull request #847 from digitalgreenorg/platform_dev
ugeshdg May 19, 2025
5adb912
entry.sh added
ugeshdg May 19, 2025
1ed87e3
entry.sh added
ugeshdg May 19, 2025
a77cfed
Merge pull request #848 from digitalgreenorg/platform_dev
ugeshdg May 19, 2025
f1b43a1
dependencies updated
ugeshdg May 20, 2025
a6bf019
Merge pull request #849 from digitalgreenorg/platform_dev
ugeshdg May 21, 2025
11cd0c7
local storage code updated
ugeshdg May 21, 2025
3558d7c
Merge pull request #850 from digitalgreenorg/platform_dev
ugeshdg May 21, 2025
7e6f12f
Update views.py
ugeshdg Jun 3, 2025
91b5585
Update views.py
ugeshdg Jun 3, 2025
03fb746
bot login api added
ugeshdg Jun 3, 2025
3547980
bot login api added
ugeshdg Jun 3, 2025
c74ea6c
get_content api updated
ugeshdg Jun 10, 2025
524c046
file url validated
ugeshdg Jul 1, 2025
70 changes: 41 additions & 29 deletions Dockerfile
@@ -1,29 +1,41 @@
FROM python:3.11-slim

# Install dependencies
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libsasl2-dev curl gcc libldap2-dev libpq-dev python3-dev\

&& DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker} \
&& mkdir -p $DOCKER_CONFIG/cli-plugins \
&& curl -SL https://github.com/docker/compose/releases/download/v2.2.3/docker-compose-linux-x86_64 -o $DOCKER_CONFIG/cli-plugins/docker-compose \
&& chmod +x $DOCKER_CONFIG/cli-plugins/docker-compose

# Set the working directory and copy the application files
WORKDIR /datahub
COPY . /datahub

# Upgrade pip and install required Python packages
RUN python -m pip install --upgrade pip \
&& pip install python-ldap==3.3.1 \
&& pip install --upgrade pyopenssl \
&& pip install -r requirements.txt

# Set environment variables
ENV PYTHONUNBUFFERED 1

# Expose port 8000 for the Django app
EXPOSE 8000

# Command to run the Django development server
CMD ["python", "manage.py", "runserver", "0.0.0.0:8000"]

# ------------ Stage 1: Build dependencies -------------
FROM python:3.11-slim as builder

# Install system build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential gcc curl libpq-dev libsasl2-dev libldap2-dev python3-dev \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# Create a virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy and install Python dependencies
COPY requirements.txt .
RUN pip install --upgrade pip \
&& pip install --no-cache-dir python-ldap==3.3.1 \
&& pip install --no-cache-dir pyopenssl \
&& pip install --no-cache-dir -r requirements.txt

# ------------ Stage 2: Final minimal image -------------
FROM python:3.11-slim

WORKDIR /app

# Install runtime dependencies only
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg libsm6 libxext6 libsasl2-dev libldap2-dev libpq-dev \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# Copy virtualenv and source code from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy app source code
COPY . .

ENV PYTHONUNBUFFERED 1
EXPOSE 8000

CMD ["python", "manage.py", "runserver", "0.0.0.0:8000"]

235 changes: 0 additions & 235 deletions Restructured_Agricultural_Data_with_Descriptions.csv

This file was deleted.

15 changes: 15 additions & 0 deletions accounts/views.py
@@ -353,7 +353,22 @@ def reset(self, request, *args, **kwargs):
status=status.HTTP_400_BAD_REQUEST,
)

@action(detail=False, methods=["post"])
def bot_login(self, request, *args, **kwargs):
    """POST method: look up a registered user by email and return the serialized user"""
    email = request.data.get("email")
    user_obj = User.objects.filter(email=email)
    user = user_obj.first()
    if not user:
        return Response(
            {"email": "User not registered"},
            status=status.HTTP_401_UNAUTHORIZED,
        )
    serializer = UserCreateSerializer(user)
    return Response(serializer.data)



@permission_classes([])
class ResendOTPViewset(GenericViewSet):
"""ResendOTPViewset for users to register"""
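For reference, a minimal sketch of calling the new bot_login action, assuming the viewset is routed under /users/ (the route prefix is an assumption, not part of this diff):

import requests

# Hypothetical endpoint; adjust the prefix to whatever the project's router uses.
resp = requests.post(
    "http://localhost:8000/users/bot_login/",
    json={"email": "bot@example.org"},
)
if resp.status_code == 401:
    print("User not registered")
else:
    print(resp.json())  # user details from UserCreateSerializer

Note that the action performs no password or token check; it only verifies that the email belongs to a registered user.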
6 changes: 1 addition & 5 deletions ai/vector_db_builder/load_audio_and_video.py
@@ -82,12 +82,8 @@ def generate_transcriptions_summary(self, url):
'format': 'bestaudio/best',
'outtmpl': local_temp_path,
'quiet': False,
'headers': {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
},
'cookiefile': "../../youtube_cookies.txt", # Path to your exported cookies
'cookiefile': "ai/vector_db_builder/youtube_cookies.txt", # Path to your exported cookies
}

try:
# Download the audio
with YoutubeDL(ydl_opts) as ydl:
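A minimal sketch of the resulting yt-dlp options after this change; the output template and video URL below are hypothetical:

from yt_dlp import YoutubeDL

ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': '/tmp/audio.%(ext)s',  # hypothetical output path
    'quiet': False,
    'cookiefile': "ai/vector_db_builder/youtube_cookies.txt",  # exported cookies, as in the diff
}
with YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://www.youtube.com/watch?v=EXAMPLE"])  # hypothetical video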
14 changes: 13 additions & 1 deletion ai/vector_db_builder/load_documents.py
@@ -36,10 +36,22 @@ def temporary_file(suffix=""):

class LoadDocuments:

def _get_full_url(self, file):
"""Construct full URL for media files using DATAHUB_SITE environment variable."""
if file.startswith('/media'):
base_url = os.environ.get("DATAHUB_SITE", "http://localhost:8000")
return base_url + file
return file

def load_by_file_extension(self, file):
if file.endswith(".pdf"):
LOGGING.info(f"pdf file loader started for file: {file}")
return PyMuPDFLoader(file.replace('http://localhost:8000/', "")).load(), 'pdf'
# Handle media files with DATAHUB_SITE environment variable
if file.startswith('/media'):
full_url = self._get_full_url(file)
return PyMuPDFLoader(full_url).load(), 'pdf'
else:
return PyMuPDFLoader(file).load(), 'pdf'
elif file.endswith(".csv"):
with temporary_file(suffix=".csv") as temp_pdf_path:
response = requests.get(file)
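A short sketch of how the new _get_full_url helper resolves relative media paths; the DATAHUB_SITE value is hypothetical:

import os
os.environ["DATAHUB_SITE"] = "https://datahub.example.org"  # hypothetical host

loader = LoadDocuments()
print(loader._get_full_url("/media/docs/report.pdf"))
# -> https://datahub.example.org/media/docs/report.pdf
print(loader._get_full_url("https://other.example.org/report.pdf"))
# -> returned unchanged: only paths starting with /media are rewritten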
33 changes: 23 additions & 10 deletions ai/vector_db_builder/load_website.py
@@ -20,13 +20,26 @@ def process_website_content(self, url):
return "", ""

def aggregate_links_content(self, links, doc_text):
def fetch_content(link):
main_content, web_links = self.process_website_content(link)
return main_content, link

with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(fetch_content, link) for link in set(links)]
for future in as_completed(futures):
main_content, link = future.result()
doc_text += f" Below content related to link: {link} \n"+ main_content
return doc_text
# def fetch_content(link):
# main_content, web_links = self.process_website_content(link)
# return main_content, link
for link in set(links):
    main_content, _web_links = self.process_website_content(link)
    doc_text += f" Below content related to link: {link} \n" + main_content

return doc_text


# import asyncio

# async def fetch_content(self, link):
# main_content, web_links = await self.process_website_content(link)
# return main_content, link

# async def aggregate_links_content_async(self, links, doc_text):
# tasks = [self.fetch_content(link) for link in set(links)]
# results = await asyncio.gather(*tasks)

# for main_content, link in results:
# doc_text += f" Below content related to link: {link} \n" + main_content
# return doc_text
16 changes: 12 additions & 4 deletions ai/vector_db_builder/vector_build.py
@@ -110,9 +110,16 @@ def load_documents(url, file, doc_type, resource_file, transcription=""):
try:

if doc_type == 'api':
    absolute_path = os.path.join(settings.MEDIA_ROOT, file.replace("/media/", ''))
    loader = JSONLoader(file_path=absolute_path, jq_schema='.', text_content=False)
    return loader.load(), "completed"
    with temporary_file(suffix=".json") as temp_pdf_path:
        response = requests.get(file)
        if response.status_code != 200:
            return response.text, "failed"
        with open(temp_pdf_path, 'wb') as f:
            f.write(response.content)
        LOGGING.info(f"Downloaded the api file {file} to a temporary path")
        loader = JSONLoader(file_path=temp_pdf_path, jq_schema='.', text_content=False)
        return loader.load(), "completed"

elif doc_type in ['youtube', 'pdf', 'website', 'file', 'dropbox', 's3', 'google_drive']:
with temporary_file(suffix=".pdf") as temp_pdf_path:
if doc_type == 'youtube':
@@ -131,6 +138,7 @@ def load_documents(url, file, doc_type, resource_file, transcription=""):
doc_text = ""
web_site_loader = WebsiteLoader()
main_content, web_links = web_site_loader.process_website_content(url)
# doc_text = web_site_loader.aggregate_links_content(web_links, doc_text)
doc_text = web_site_loader.aggregate_links_content(web_links, doc_text)
all_content = main_content + "\n" + doc_text
build_pdf(all_content.replace("\n", " "), temp_pdf_path)
@@ -151,7 +159,7 @@ def load_documents(url, file, doc_type, resource_file, transcription=""):
LOGGING.error(f"Unsupported input type: {doc_type}")
return f"Unsupported input type: {doc_type}", "failed"
except Exception as e:
LOGGING.error(f"Faild lo load the documents: {str(e)}")
LOGGING.error(f"Faild lo load the documents: {str(e)}", exc_info=True)
return str(e), "failed"

def split_documents(documents, chunk_size, chunk_overlap):
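A hypothetical call illustrating the reworked 'api' branch, which now fetches the JSON over HTTP into a temporary file instead of reading it from MEDIA_ROOT; the file URL below is illustrative:

docs, state = load_documents(
    url="",  # unused for the 'api' type
    file="https://datahub.example.org/media/api/sample.json",  # hypothetical file URL
    doc_type="api",
    resource_file=None,
)
print(state)  # "completed" on success, "failed" otherwise (docs then holds the error text)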
26 changes: 26 additions & 0 deletions ai/vector_db_builder/youtube_cookies.txt
@@ -0,0 +1,26 @@
# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

.youtube.com TRUE / TRUE 1762843691 LOGIN_INFO AFmmF2swRAIgbK9lgXpJD3G9KOCwWfSE40Q2IuLTvcK0FFFwDcUZhgICIGmbRPPnvnAuVsgDjRofmJp4BT3ilU8jgKpi3FS8rV33:QUQ3MjNmeWJHX1lvdThrYWgxaThpQVY4WUNjNl9zdGhydzEzRWppbWdIb253Mko2R1ZjUG1MaXBiTWlGdUVBbWhnOE9HTWdDR3JHSHNfLWdsbUlJY2g5aERlTE1Ca3d3LTRXUGt0VHhscElxdXFiZnJVdW51eW8wNWkyck4tdkx6QjZjLWFUVVY4ZjFrVEtYRVdMeXFhUUJHbzY0NDAwUkNR
.youtube.com TRUE / FALSE 0 PREF f6=40000000&f7=4100&tz=UTC&f4=4000000&f5=30000&hl=en
.youtube.com TRUE / FALSE 1776835393 SID g.a000uwj8nKnF2HIVNJiT6cqltlN6pMEx8Gbp11lOXLK1jg7CfK-DLDpWG08iHOzh3lUSH5XmwwACgYKAaASARASFQHGX2MiIntNHAaYwbENwMgD6G6jhhoVAUF8yKpZDVm7I7-lUa5mMRQMeIv20076
.youtube.com TRUE / TRUE 1776835393 __Secure-1PSID g.a000uwj8nKnF2HIVNJiT6cqltlN6pMEx8Gbp11lOXLK1jg7CfK-DUUl4XYF9e646FgeW9r9dmAACgYKAVkSARASFQHGX2MipGQr-JgjPZ3TagNGwThFjRoVAUF8yKo5dtjIVAvXCAm5k07fgZSO0076
.youtube.com TRUE / TRUE 1776835393 __Secure-3PSID g.a000uwj8nKnF2HIVNJiT6cqltlN6pMEx8Gbp11lOXLK1jg7CfK-DhsYrX1l-SwtgXCFxCuz7vwACgYKARMSARASFQHGX2Mi-wT7x_p3-rXdE8wcHQilpRoVAUF8yKqxu7DD-URg5qXQoIoRQzNx0076
.youtube.com TRUE / FALSE 1776835393 HSID A78hbaZg_NmFGRYBQ
.youtube.com TRUE / TRUE 1776835393 SSID AFV_ARvy3ptjFGQo-
.youtube.com TRUE / FALSE 1776835393 APISID EVGC0fZ6HmXY_hyd/AOFSt7oHi6IX9oe2K
.youtube.com TRUE / TRUE 1776835393 SAPISID D9LAREU27e0jzJ0B/AiXjnBherq8yUuI8_
.youtube.com TRUE / TRUE 1776835393 __Secure-1PAPISID D9LAREU27e0jzJ0B/AiXjnBherq8yUuI8_
.youtube.com TRUE / TRUE 1776835393 __Secure-3PAPISID D9LAREU27e0jzJ0B/AiXjnBherq8yUuI8_
.youtube.com TRUE / TRUE 1773833606 __Secure-1PSIDTS sidts-CjIB7pHptZVJltiOpcyL-ggS-aVrYwNwT9gBKgfMCSzBr0DKL-ubqjKlLiUqZFzchSnO1BAA
.youtube.com TRUE / TRUE 1773833606 __Secure-3PSIDTS sidts-CjIB7pHptZVJltiOpcyL-ggS-aVrYwNwT9gBKgfMCSzBr0DKL-ubqjKlLiUqZFzchSnO1BAA
.youtube.com TRUE / FALSE 1773833939 SIDCC AKEyXzX62tnFTxIxoB7EPWtFypEZ3jh_KHYBdR91eeUegIngOg3A27fmjlvrS5eELyWiSAhZz4g
.youtube.com TRUE / TRUE 1773833939 __Secure-1PSIDCC AKEyXzVhGAjPQRBtGctfEY9oIaqEQy2srXOk3OYLBGeuQ5oltKc0TUtEwQSYRcgCfqW7oaYRFZA
.youtube.com TRUE / TRUE 1773833939 __Secure-3PSIDCC AKEyXzXiSf7CyVgyfvqbJP20PSO_F3sW9PKg8WJ4W8qC0JGX1odFLYSVXu-OMOaBjTnDLw7c7RU
.youtube.com TRUE / TRUE 1757849938 VISITOR_PRIVACY_METADATA CgJJThIEGgAgZQ%3D%3D
.youtube.com TRUE / TRUE 1757849938 VISITOR_INFO1_LIVE 0dRpFqduvCM
.youtube.com TRUE / TRUE 0 YSC 92vz5nwwhB0
.youtube.com TRUE / TRUE 1757834926 __Secure-ROLLOUT_TOKEN CPKggL-Ln6mhgwEQ3LXlhKCzigMY062W142TjAM%3D
.youtube.com TRUE / TRUE 1757849938 YT_DEVICE_MEASUREMENT_ID W1LSi4k=
.youtube.com TRUE / TRUE 1805369938 __Secure-YT_TVFAS t=475422&s=2
.youtube.com TRUE / TRUE 1757849938 DEVICE_INFO ChxOelE0TXpFeE1qWTJORGt4TVRBNU9EQXhOZz09ENK25b4GGNK25b4G
Binary file added celerybeat-schedule.dir
Binary file not shown.
Binary file added celerybeat-schedule.pag
Binary file not shown.
3 changes: 3 additions & 0 deletions core/celery.py
@@ -5,10 +5,13 @@
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')

app = Celery('datahub-api')
# from datahub import celery_tasks

# Load task modules from all registered Django apps.
app.config_from_object('django.conf:settings', namespace='CELERY')
app.conf.broker_connection_retry = True
app.conf.broker_connection_retry_on_startup = True
app.conf.worker_cancel_long_running_tasks_on_connection_loss = True
app.autodiscover_tasks()
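A minimal sketch of a task registered against this app, assuming it sits in one of the autodiscovered Django apps; the task name and body are illustrative only:

from core.celery import app

@app.task
def ping():
    # With broker_connection_retry_on_startup enabled above, a worker started
    # before the Redis broker is reachable keeps retrying instead of exiting.
    return "pong"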


45 changes: 32 additions & 13 deletions core/settings.py
@@ -18,6 +18,7 @@

collections.Callable = collections.abc.Callable
from corsheaders.defaults import default_headers
from celery.schedules import crontab

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -62,6 +63,7 @@
"drf_spectacular_sidecar",
"django_nose",
"django_filters",
"django_celery_beat",
# custom apps
"accounts",
"datahub",
@@ -174,21 +176,22 @@
STATIC_ROOT = os.path.join(BASE_DIR, "static")
STATIC_URL = "static/"

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID",'')
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY",'')
AWS_STORAGE_BUCKET_NAME = os.environ.get("AWS_STORAGE_BUCKET_NAME",'')
AWS_S3_REGION_NAME = os.environ.get("AWS_S3_REGION_NAME",'') # e.g., 'us-east-1'
AWS_S3_SIGNATURE_VERSION = 's3v4'
if os.environ.get("STORAGE", "s3") == "s3":

# Django Storages settings
DEFAULT_FILE_STORAGE = 'storages.backends.s3boto3.S3Boto3Storage'
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID",'')
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY",'')
AWS_STORAGE_BUCKET_NAME = os.environ.get("AWS_STORAGE_BUCKET_NAME",'')
AWS_S3_REGION_NAME = os.environ.get("AWS_S3_REGION_NAME",'') # e.g., 'us-east-1'
AWS_S3_SIGNATURE_VERSION = 's3v4'

AWS_S3_FILE_OVERWRITE = False
AWS_DEFAULT_ACL = None
AWS_QUERYSTRING_AUTH = False
# URL of your S3 bucket
AWS_S3_CUSTOM_DOMAIN = f'{AWS_STORAGE_BUCKET_NAME}.s3.amazonaws.com'
if os.environ.get("STORAGE", "s3") == "s3":
# Django Storages settings
DEFAULT_FILE_STORAGE = 'storages.backends.s3boto3.S3Boto3Storage'
AWS_S3_CUSTOM_DOMAIN = f'{AWS_STORAGE_BUCKET_NAME}.s3.amazonaws.com'

AWS_S3_FILE_OVERWRITE = False
AWS_DEFAULT_ACL = None
AWS_QUERYSTRING_AUTH = False
# URL of your S3 bucket
MEDIA_URL = f'https://{AWS_S3_CUSTOM_DOMAIN}/'
else:
MEDIA_ROOT = os.path.join(BASE_DIR, "media")
@@ -440,10 +443,26 @@
CELERY_BROKER_URL = os.environ.get("CELERY_BROKER_URL",'')
FILE_UPLOAD_MAX_MEMORY_SIZE = 25 * 1024 * 1024 # 25 Mb limit
CELERY_BROKER_URL = f'redis://{os.environ.get("REDIS_SERVICE", "localhost")}:6379/0'
CELERY_RESULT_BACKEND = f'redis://{os.environ.get("REDIS_SERVICE", "localhost")}:6379/0'

# SMTP server configuration


SMTP_SERVER = os.environ.get("SMTP_SERVER",'') # e.g., 'smtp.gmail.com' for Gmail
SMTP_PORT = 587 # or 465 for SSL
SMTP_USER = os.environ.get("SMTP_USER",'')
SMTP_PASSWORD = os.environ.get("SMTP_PASSWORD",'')



CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = 'UTC'
CELERY_BEAT_SCHEDULE = {
    'fetch_dataset_for_all_files': {
        'task': 'core.utils.fetch_data_for_all_datasets',
        'schedule': crontab(minute=0, hour=0),  # daily at midnight (minute=0, hour=0)
        # 'schedule': crontab(minute='*/1')
    },
}
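For context, a minimal sketch of the periodic task the schedule points at, assuming it is defined in core/utils.py; the body is illustrative, not the project's actual implementation:

from celery import shared_task

@shared_task(name="core.utils.fetch_data_for_all_datasets")
def fetch_data_for_all_datasets():
    # Refresh the content of every registered dataset once a day.
    ...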