new script
This commit is contained in:
parent
862a0efd37
commit
92cd221dde
87
extract_v2.py
Normal file
87
extract_v2.py
Normal file
@ -0,0 +1,87 @@
|
||||
from airflow import DAG
|
||||
from airflow.decorators import task
|
||||
from airflow.models import Variable
|
||||
from datetime import datetime, date, timedelta
|
||||
from pymongo import MongoClient
|
||||
|
||||
from jobspy import scrape_jobs
|
||||
|
||||
import pandas as pd
|
||||
import yaml
|
||||
import graypy
|
||||
import logging
|
||||
|
||||
# Module-wide debug switch; nothing in the visible code reads it yet.
DEBUG = False

# Task-level defaults that the DAG below receives through ``default_args``.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
)
|
||||
|
||||
# Configure logging
logger = logging.getLogger('JobBot')
logger.setLevel(logging.INFO)

# Configure graypy handler for GELF UDP.
# Loggers are process-wide objects keyed by name, so if this module is
# imported more than once in the same process an unconditional addHandler()
# would stack another GELF handler each time and every record would be sent
# repeatedly. Reuse an already-attached handler when there is one.
graylog_handler = next(
    (h for h in logger.handlers if isinstance(h, graypy.GELFUDPHandler)),
    None,
)
if graylog_handler is None:
    graylog_handler = graypy.GELFUDPHandler('graylog.localdomain', 12201)
    # NOTE(review): confirm the installed graypy version actually reads this
    # attribute; if it does not, this assignment has no effect.
    graylog_handler.include_logger_name = True
    logger.addHandler(graylog_handler)
|
||||
|
||||
def load_search_config(config_path):
    """Parse the YAML file at *config_path* and return the loaded object.

    The file is opened explicitly as UTF-8 so that parsing does not depend
    on the platform's default locale encoding.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document. The caller in
        this file indexes the result with ``['searches']``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)
|
||||
|
||||
def process_jobs(search_params):
    """Run one configured search through ``scrape_jobs``.

    Args:
        search_params: Mapping with a ``'name'`` entry (used only for the
            log line) and a ``'params'`` mapping that is expanded into
            keyword arguments for ``scrape_jobs``.

    Returns:
        The value returned by ``scrape_jobs``, passed through unchanged.

    Raises:
        KeyError: If ``'name'`` or ``'params'`` is missing.
    """
    # Use the module's configured logger instead of print() so the message
    # also reaches the handler attached to the 'JobBot' logger.
    logger.info("Scraping jobs with parameters: %s", search_params['name'])
    return scrape_jobs(**search_params['params'])
|
||||
|
||||
def date_to_datetime(d):
    """Promote a plain ``date`` to a ``datetime`` at midnight.

    ``datetime`` instances (which are also ``date`` subclasses) and values
    of any other type are returned unchanged.
    """
    is_plain_date = isinstance(d, date) and not isinstance(d, datetime)
    if not is_plain_date:
        return d
    midnight = datetime.min.time()
    return datetime.combine(d, midnight)
|
||||
|
||||
with DAG(
    'job_bot_api_dag',
    default_args=default_args,
    description='A DAG to fetch data from job-bot API and process it',
    schedule='*/10 * * * *',  # Every 10 minutes
    # Use a fixed start_date: a value computed from datetime.now() moves
    # every time this file is parsed, which Airflow's documentation advises
    # against. With catchup=False any fixed date in the past will do.
    start_date=datetime(2024, 1, 1),
    catchup=False,
    max_active_runs=1,
    tags=['job-bot', 'api'],
) as dag:

    @task()
    def fetch_jobs():
        """Run every configured search and report how many jobs were found.

        Reads ``search_criteria.yaml``, calls ``process_jobs`` for each entry
        under ``searches`` and, when an entry has a ``filter`` section, drops
        rows whose ``filter.field`` column contains any of the
        ``filter.exclude`` strings (case-insensitive). A failing search is
        logged and skipped so the remaining searches still run.
        """
        # Local import keeps this block self-contained.
        import re

        # Load configuration.
        # NOTE(review): this relative path is resolved against the worker's
        # current working directory; confirm that is where the file lives.
        config = load_search_config('search_criteria.yaml')

        # Collect per-search frames and concatenate once after the loop.
        frames = []

        # Process each search configuration
        for search in config['searches']:
            try:
                found = process_jobs(search)
                if len(found) == 0:
                    continue

                # Apply filters from search configuration if they exist
                if 'filter' in search:
                    filter_field = search['filter']['field']
                    exclude_list = search['filter']['exclude']
                    # An empty list would give the pattern '' which matches
                    # every row, so only filter when there is something to
                    # exclude.
                    if exclude_list:
                        # Entries are literal strings (e.g. URLs); escape
                        # them so '.' and similar are not read as regex.
                        pattern = '|'.join(re.escape(term) for term in exclude_list)
                        matches = found[filter_field].str.contains(pattern, case=False, na=False)
                        found = found[~matches]

                frames.append(found)
            except Exception:
                # Best effort: record the failure with its traceback and
                # carry on with the next search.
                logger.exception("Error processing search '%s'", search['name'])
                continue

        jobs = pd.concat(frames) if frames else pd.DataFrame()

        # Basic stats
        logger.info("Found %d jobs", len(jobs))

    # TaskFlow dependencies
    api_results = fetch_jobs()

# Allow running the DAG once locally with ``python extract_v2.py``.
if __name__ == "__main__":
    dag.test()
|
||||
@ -38,6 +38,7 @@ with DAG(
|
||||
schedule='*/10 * * * *', # Every 10 minutes
|
||||
start_date=datetime.now() - timedelta(days=1), # Changed to today-1 day
|
||||
catchup=False,
|
||||
max_active_runs=1,
|
||||
tags=['job-bot', 'api'],
|
||||
) as dag:
|
||||
|
||||
|
||||
58
search_criteria.yaml
Normal file
58
search_criteria.yaml
Normal file
@ -0,0 +1,58 @@
|
||||
# ├── Indeed limitations:
|
||||
# | Only one from this list can be used in a search:
|
||||
# | - hours_old
|
||||
# | - job_type & is_remote
|
||||
# | - easy_apply
|
||||
# │
|
||||
# └── LinkedIn limitations:
|
||||
# | Only one from this list can be used in a search:
|
||||
# | - hours_old
|
||||
# | - easy_apply
|
||||
|
||||
searches:
|
||||
- name: indeed_engineering_manager
|
||||
params:
|
||||
site_name: ["indeed"]
|
||||
search_term: '"engineering manager"'
|
||||
location: "Mill Valley, CA"
|
||||
distance: 17
|
||||
results_wanted: 10
|
||||
country_indeed: 'USA'
|
||||
|
||||
- name: indeed_engineering_manager_remote
|
||||
params:
|
||||
site_name: ["indeed"]
|
||||
search_term: '"engineering manager"'
|
||||
is_remote: true
|
||||
results_wanted: 10
|
||||
country_indeed: 'USA'
|
||||
|
||||
- name: google_engineering_manager
|
||||
params:
|
||||
site_name: ["google"]
|
||||
google_search_term: 'senior engineering manager'
|
||||
location: "San Francisco, CA"
|
||||
results_wanted: 10
|
||||
hours_old: 24
|
||||
filter:
|
||||
field: "job_url"
|
||||
exclude: ["https://www.linkedin.com", "https://www.glassdoor.com", "https://www.indeed.com"]
|
||||
|
||||
- name: linkedin_engineering_manager
|
||||
params:
|
||||
site_name: ["linkedin"]
|
||||
search_term: 'senior engineering manager'
|
||||
linkedin_fetch_description: true
|
||||
location: "Mill Valley, CA"
|
||||
distance: 17
|
||||
results_wanted: 10
|
||||
hours_old: 24
|
||||
|
||||
- name: linkedin_engineering_manager_remote
|
||||
params:
|
||||
site_name: ["linkedin"]
|
||||
search_term: 'senior engineering manager'
|
||||
linkedin_fetch_description: true
|
||||
is_remote: true
|
||||
results_wanted: 10
|
||||
hours_old: 24
|
||||
Loading…
x
Reference in New Issue
Block a user