Python #WebScraping #Django
Firstly, my name is Daniel-Caleb Cheruiyot Ronoh, and I am a mobile and backend developer as well as a student.
Recently, I tackled a project: creating APIs that scrape job data from a popular job site in Kenya, filter the necessary information, and store it in a database for easy access through various endpoints. This journey has been both challenging and enlightening, and it’s precisely the kind of experience I look forward to expanding upon during the HNG Internship.
My goal was to create an efficient solution that scrapes job data, filters out the essential information, and makes it easily accessible through a RESTful API built with Django and the Django REST framework.
Step 1: Setting Up Django and Django REST Framework
First, I needed to set up a Django project and integrate the Django REST framework. I created a new Django project and app, then installed the necessary packages.
django-admin startproject job_scraper
cd job_scraper
django-admin startapp jobs
pip install djangorestframework
I then updated 'settings.py' to include the REST framework and the new app.
INSTALLED_APPS = [
…
'rest_framework',
'jobs',
]
Step 2: Scraping Job Data
The first challenge was to scrape data from the job site. I chose Python for its robust libraries such as BeautifulSoup and Requests, which make web scraping straightforward.
Step 3: Creating the Database Models
I created a model to store the job information in the models.py file of the jobs app.
# Create your models here.
from django.conf import settings
from django.db import models  # BUG FIX: required by every model below; was missing
from django.db.models.signals import post_save
from django.dispatch import receiver
from rest_framework.authtoken.models import Token


@receiver(post_save, sender=settings.AUTH_USER_MODEL)
def create_auth_token(sender, instance=None, created=False, **kwargs):
    """Create a DRF auth token for every newly created user.

    Connected to the user model's post_save signal; only fires when the
    user row is first created, not on subsequent saves.
    """
    if created:
        Token.objects.create(user=instance)
class JobFunctions(models.Model):
    # Lookup table of job-function names scraped from the site's dropdown.
    # NOTE(review): max_length is not enforced on TextField; CharField would
    # be the conventional choice for short labels — left as-is to avoid a
    # schema migration.
    jobFunction = models.TextField(max_length=100)


class JobIndustries(models.Model):
    # Lookup table of industry names.
    jobIndustries = models.TextField(max_length=100)


class JobLocation(models.Model):
    # Lookup table of location names.
    jobLocation = models.TextField(max_length=100)


class JobImages(models.Model):
    # URL of a job listing's logo/image.
    jobImages = models.URLField(max_length=1000)
class Jobs(models.Model):
    """A scraped job posting, linked to its function/industry/location/image rows."""
    job_title = models.CharField(max_length=1000)
    # NOTE(review): stored as free text exactly as scraped (e.g. "3 days ago"),
    # not a DateField — confirm whether date arithmetic is ever needed.
    scraped_date = models.CharField(max_length=1000)
    job_link = models.URLField(max_length=5000)
    Job_Image = models.ForeignKey(JobImages, related_name='Job_Images', on_delete=models.CASCADE)
    Job_Function = models.ForeignKey(JobFunctions, related_name='Job_Functions', on_delete=models.CASCADE)
    Job_Industries = models.ForeignKey(JobIndustries, related_name='Job_Industries', on_delete=models.CASCADE)
    Job_Location = models.ForeignKey(JobLocation, related_name='Job_Location', on_delete=models.CASCADE)
    # BUG FIX: the original `default=”` was an unterminated smart quote;
    # an empty string is the intended default.
    Job_Details = models.TextField(max_length=1000, default='')

    def __str__(self):
        # Scrape date serves as the display label (e.g. in the admin).
        return self.scraped_date
class JobDetails(models.Model):
    """One line/paragraph of a job's detail text; `bold` marks emphasized rows."""
    # related_name='job_details' is relied on by JobSerializer's nested field.
    job = models.ForeignKey(Jobs, related_name='job_details', on_delete=models.CASCADE)
    details = models.TextField(max_length=20000)
    bold = models.BooleanField(default=False)
I then ran the migrations to create the table in the database.
python manage.py makemigrations
python manage.py migrate
Step 4: Scraping Job Data
from .models import Jobs, JobFunctions, JobIndustries, JobLocation, JobDetails, JobImages
from rest_framework import viewsets
from .serializers import JobSerializer, JobFunctionSerializer, JobIndustriesSerializer, JobLocationSerializer, JobDetailsSerializer, JobImagesSerializer
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
# --- Scrape the BrighterMonday listings page and persist everything found ---
link = 'https://www.brightermonday.co.ke/jobs'
page = requests.get(link)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html.parser')

    # DROPDOWN MENUS -------------------------------------------------------
    # NOTE(review): all three <select> lookups below use the *identical* CSS
    # class string, so each soup.find() matches the same first <select> on
    # the page — verify the industries/locations selectors against the live
    # markup.

    # SELECTION FOR JOB FUNCTIONS
    select_functions = soup.find('select', class_="w-full h-10 pl-2 text-gray-500 rounded-md border border-gray-300 hover:border-gray-400 focus:border-gray-400 placeholder-gray-400 focus:placeholder-gray-900 mb-3 w-full md:mb-0 md:mr-3")
    # BUG FIX: the tag name is <option>; find_all('options') matched nothing.
    options_functions = select_functions.find_all('option')
    for option in options_functions:
        functions = JobFunctions()
        # BUG FIX: the BeautifulSoup API is get_text(); get_Text() raised
        # AttributeError.
        functions.jobFunction = option.get_text()
        functions.save()

    # SELECTION FOR JOB INDUSTRIES
    select_industries = soup.find('select', class_="w-full h-10 pl-2 text-gray-500 rounded-md border border-gray-300 hover:border-gray-400 focus:border-gray-400 placeholder-gray-400 focus:placeholder-gray-900 mb-3 w-full md:mb-0 md:mr-3")
    options_industries = select_industries.find_all('option')
    for option in options_industries:
        industries = JobIndustries()
        industries.jobIndustries = option.get_text()
        industries.save()

    # SELECTION FOR JOB LOCATIONS
    select_locations = soup.find('select', class_="w-full h-10 pl-2 text-gray-500 rounded-md border border-gray-300 hover:border-gray-400 focus:border-gray-400 placeholder-gray-400 focus:placeholder-gray-900 mb-3 w-full md:mb-0 md:mr-3")
    options_locations = select_locations.find_all('option')
    for option in options_locations:
        location = JobLocation()
        location.jobLocation = option.get_text()
        location.save()

    # BASIC INFO — job title, job link, posting date per listing card
    divs = soup.find_all('div', class_="mx-5 md:mx-0 flex flex-wrap col-span-1 mb-5 bg-white rounded-lg border border-gray-300 hover:border-gray-400 focus-within:ring-2 focus-within:ring-offset-2 focus-within:ring-gray-500")
    for job in divs:
        job_title = job.find('div', class_="flex items-center").find('p', class_='text-lg font-medium break-words text-link-500').get_text().strip()
        job_link = job.find('div', class_="flex items-center").find('a', class_='relative mb-3 text-lg font-medium break-words focus:outline-none metrics-apply-now text-link-500 text-loading-animate')['href']
        # dates is always a str here (get_text().strip()), so the original
        # dead `if dates is not None` guard and unused `save = Jobs()` local
        # were removed.
        dates = job.find('div', class_="flex flex-row items-start items-center px-5 py-3 w-full border-t border-gray-300").find('p', class_='ml-auto text-sm font-normal text-gray-700 text-loading-animate').get_text().strip()

        job_image = job.find('img')
        if job_image:
            src = job_image.get('src')
        else:
            # Listings without a logo are skipped entirely.
            continue

        # Job Functions, Details and Summary — fetched from the listing page
        job_response = requests.get(job_link)
        job_soup = BeautifulSoup(job_response.content, 'html.parser')
        Job_function_name = job_soup.find('div', class_='flex flex-wrap justify-start pt-5 pb-2 px-4 w-full border-b border-gray-300 md:flex-nowrap md:px-5').find('div', class_='w-full text-gray-500').find('h2', class_='text-sm font-normal').find('a').get_text(strip=True)
        job_search = job_soup.find('div', class_='mt-3')
        Job_location_name = job_search.find('a', class_="text-sm font-normal px-3 rounded bg-brand-secondary-50 mr-2 mb-3 inline-block").get_text(strip=True)
        industry_search = job_soup.find('div', class_='w-full text-gray-500')
        Job_industries_name = industry_search.find_all('div')[1].find('a', class_='text-sm font-normal px-3 rounded bg-brand-secondary-50 mr-2 mb-3 inline-block').get_text(strip=True)

        # get_or_create avoids duplicating lookup rows across scrape runs.
        jobFunction, _ = JobFunctions.objects.get_or_create(jobFunction=Job_function_name)
        jobIndustries, _ = JobIndustries.objects.get_or_create(jobIndustries=Job_industries_name)
        jobLocation, _ = JobLocation.objects.get_or_create(jobLocation=Job_location_name)

        # Renamed from `job_image` to avoid shadowing the <img> tag above.
        image_record = JobImages(jobImages=src)
        image_record.save()

        new_job = Jobs(
            job_title=job_title,
            scraped_date=dates,
            job_link=job_link,
            Job_Function=jobFunction,
            Job_Industries=jobIndustries,
            Job_Location=jobLocation,
            Job_Image=image_record,
        )
        new_job.save()

        # HERE WE SCRAPE THE JOB DETAILS NESTED IN THE JOB LINK !!!
        jb_summary = job_soup.find('div', class_='py-5 px-4 border-b border-gray-300 md:p-5')
        if jb_summary.find('h3').get_text():
            description = JobDetails()
            description.job = new_job
            description.details = jb_summary.find('h3').get_text()
            description.save()
        if jb_summary.find('p').get_text():
            # BUG FIX: the original assigned to a misspelled `descriptio`
            # and then mutated/re-saved the previous JobDetails row instead
            # of creating a new one.
            description = JobDetails()
            description.job = new_job
            description.details = jb_summary.find('p').get_text()
            description.save()
        qualification = jb_summary.find('ul')
        if qualification:
            for requirements in qualification.find_all('li'):
                description = JobDetails()
                description.job = new_job
                description.details = requirements.get_text()
                description.save()

        # Remaining free-form detail rows; a nested <b> marks a bold heading.
        job_info = job_soup.find('div', class_='text-sm text-gray-500')
        for info in job_info:
            bold_tag = info.find('b')
            content = info.get_text()
            if bold_tag:
                job_detail = JobDetails(job=new_job, details=content, bold=True)
            else:
                job_detail = JobDetails(job=new_job, details=content, bold=False)
            job_detail.save()
            # A <ul> sibling following a heading holds its bullet items.
            next_info = info.find_next_sibling()
            if next_info and next_info.name == 'ul':
                ul = info.find_next_sibling('ul')
                if ul:
                    for li in ul.find_all('li'):
                        content = li.text.strip()
                        job_detail1 = JobDetails(job=new_job, details=content)
                        job_detail1.save()
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework import authentication, permissions
from django.contrib.auth.models import User


class ListUsers(APIView):
    """
    View to list all users in the system.

    * Requires token authentication.
    * Any authenticated user may access this view.
      NOTE(review): the original docstring claimed admin-only access, but the
      permission class is IsAuthenticated — use permissions.IsAdminUser to
      actually restrict this to admins.
    """
    authentication_classes = [authentication.TokenAuthentication]
    permission_classes = [permissions.IsAuthenticated]

    def get(self, request, format=None):
        """Return a list of all usernames."""
        usernames = [user.username for user in User.objects.all()]
        return Response(usernames)
from rest_framework.authtoken.views import ObtainAuthToken
from rest_framework.authtoken.models import Token
from rest_framework.response import Response


class CustomAuthToken(ObtainAuthToken):
    """Token login endpoint that also returns the user's id and email."""

    def post(self, request, *args, **kwargs):
        serializer = self.serializer_class(data=request.data,
                                           context={'request': request})
        serializer.is_valid(raise_exception=True)
        user = serializer.validated_data['user']
        # Reuse an existing token rather than issuing a new one per login.
        token, created = Token.objects.get_or_create(user=user)
        return Response({
            'token': token.key,
            'user_id': user.pk,
            'email': user.email,
        })
class JobDetailViewSet(viewsets.ModelViewSet):
    """CRUD endpoints for individual job detail lines."""
    queryset = JobDetails.objects.all()
    serializer_class = JobDetailsSerializer


class JobViewSet(viewsets.ModelViewSet):
    """CRUD endpoints for scraped job postings."""
    queryset = Jobs.objects.all()
    serializer_class = JobSerializer


class JobFunctionViewset(viewsets.ModelViewSet):
    """CRUD endpoints for the job-function lookup table."""
    queryset = JobFunctions.objects.all()
    serializer_class = JobFunctionSerializer


class JobIndustriesViewset(viewsets.ModelViewSet):
    """CRUD endpoints for the industry lookup table."""
    queryset = JobIndustries.objects.all()
    serializer_class = JobIndustriesSerializer


class JobLocationViewset(viewsets.ModelViewSet):
    """CRUD endpoints for the location lookup table."""
    queryset = JobLocation.objects.all()
    serializer_class = JobLocationSerializer


class JobImageViewset(viewsets.ModelViewSet):
    """CRUD endpoints for job image URLs."""
    queryset = JobImages.objects.all()
    serializer_class = JobImagesSerializer
Step 5: Creating the API Endpoints
Using the Django REST framework, I created serializers and views to expose the job data through API endpoints.
from .models import Jobs, JobFunctions, JobLocation, JobIndustries, JobDetails, JobImages
from datetime import datetime, timedelta
from django.contrib.auth.models import User
from rest_framework import serializers
class UserSerializer(serializers.HyperlinkedModelSerializer):
    """Serialize users with a hyperlinked url plus basic identity fields."""

    class Meta:
        model = User
        # BUG FIX: Django's User model fields are 'first_name'/'last_name';
        # the original 'FirstName'/'LastName' do not exist on the model and
        # would raise an error when the serializer is used.
        fields = ['url', 'username', 'email', 'first_name', 'last_name']


class JobDetailsSerializer(serializers.ModelSerializer):
    class Meta:
        model = JobDetails
        fields = '__all__'


class JobFunctionSerializer(serializers.ModelSerializer):
    class Meta:
        model = JobFunctions
        fields = '__all__'


class JobIndustriesSerializer(serializers.ModelSerializer):
    class Meta:
        model = JobIndustries
        fields = '__all__'


class JobLocationSerializer(serializers.ModelSerializer):
    class Meta:
        model = JobLocation
        fields = '__all__'


class JobImagesSerializer(serializers.ModelSerializer):
    class Meta:
        model = JobImages
        fields = '__all__'


class JobSerializer(serializers.ModelSerializer):
    """Nested representation of a job with all of its related records inlined."""
    Job_Function = JobFunctionSerializer()
    Job_Industries = JobIndustriesSerializer()
    Job_Location = JobLocationSerializer()
    Job_Image = JobImagesSerializer()
    # 'job_details' matches the related_name on JobDetails.job.
    job_details = JobDetailsSerializer(many=True)

    class Meta:
        model = Jobs
        fields = ['id', 'job_title', 'scraped_date', 'job_link',
                  'Job_Function', 'Job_Industries', 'Job_Location',
                  'Job_Image', 'job_details']
I then set up the URLs for the API in urls.py.
# BUG FIX: path/include were used below but never imported.
from django.urls import include, path
from rest_framework import routers

from .views import (JobViewSet, JobDetailViewSet, JobFunctionViewset,
                    JobLocationViewset, JobIndustriesViewset)

# DefaultRouter also provides a browsable API root view.
rt = routers.DefaultRouter()
rt.register(r'BrighterMondayJobs', JobViewSet, basename='brighter')
rt.register(r'job_details', JobDetailViewSet)
rt.register(r'job_functions', JobFunctionViewset)
rt.register(r'job_locations', JobLocationViewset)
rt.register(r'job_industries', JobIndustriesViewset)

urlpatterns = [
    path('', include(rt.urls)),
    # path('api/accounts/', include('accounts.urls'))
]
And included the app’s URLs in the project’s urls.py.
"""
URL configuration for myapp project.

The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
from scraper import urls as kim
from scraper.views import ListUsers, CustomAuthToken
# from rest_framework_simplejwt.views import (
#     TokenObtainPairView,
#     TokenRefreshView,
# )

urlpatterns = [
    path('admin/', admin.site.urls),
    path('api/users/', ListUsers.as_view()),
    path('api/token/auth/', CustomAuthToken.as_view()),
    # path('api/token/', TokenObtainPairView.as_view(), name='token_obtain_pair'),
    # path('api/token/refresh/', TokenRefreshView.as_view(), name='token_refresh'),
    path('', include(kim)),
]
Here is my GitHub repo of the project: Github Link
I am glad to be part of the HNG Internship — an opportunity I would recommend to others through HNG Internship or HNG Hire.
You can reach out to me via cheruiyotdanielcaleb@gmail.com