From 92cd221dde796c2e93f606d68a8d1a1b98de31c8 Mon Sep 17 00:00:00 2001
From: Jason
Date: Fri, 30 May 2025 10:38:04 -0700
Subject: [PATCH] new script

---
 extract_v2.py        | 129 ++++++++++++++++++++++++++++++++++++++++++++
 job-bot.py           |   1 +
 search_criteria.yaml |  58 ++++++++++++++++++++
 3 files changed, 188 insertions(+)
 create mode 100644 extract_v2.py
 create mode 100644 search_criteria.yaml

diff --git a/extract_v2.py b/extract_v2.py
new file mode 100644
index 0000000..efddbba
--- /dev/null
+++ b/extract_v2.py
@@ -0,0 +1,129 @@
+from airflow import DAG
+from airflow.decorators import task
+from airflow.models import Variable
+from datetime import datetime, date, timedelta
+from pathlib import Path
+from pymongo import MongoClient
+
+from jobspy import scrape_jobs
+
+import pandas as pd
+import yaml
+import graypy
+import logging
+import re
+
+DEBUG = False
+
+# Resolve the search configuration next to this file so that loading it does
+# not depend on the working directory of the process running the task.
+CONFIG_PATH = Path(__file__).resolve().parent / 'search_criteria.yaml'
+
+default_args = {
+    'owner': 'airflow',
+    'depends_on_past': False,
+    'email_on_failure': False,
+    'email_on_retry': False,
+    'retries': 1,
+    'retry_delay': timedelta(minutes=5),
+}
+
+# Configure logging
+logger = logging.getLogger('JobBot')
+logger.setLevel(logging.INFO)
+
+# Configure graypy handler for GELF UDP
+graylog_handler = graypy.GELFUDPHandler('graylog.localdomain', 12201)
+graylog_handler.include_logger_name = True
+logger.addHandler(graylog_handler)
+
+def load_search_config(config_path):
+    """Parse the YAML search configuration at config_path and return it."""
+    # The file contains non-ASCII characters, so the encoding is explicit
+    # rather than left to the platform default.
+    with open(config_path, 'r', encoding='utf-8') as file:
+        return yaml.safe_load(file)
+
+def process_jobs(search_params):
+    """Run one search entry and return whatever scrape_jobs returns.
+
+    search_params['name'] labels the search in the log and
+    search_params['params'] is passed to scrape_jobs as keyword arguments.
+    """
+    logger.info("Scraping jobs with parameters: %s", search_params['name'])
+    return scrape_jobs(**search_params['params'])
+
+def date_to_datetime(d):
+    """Turn a plain date into a midnight datetime; return other values as is."""
+    if isinstance(d, date) and not isinstance(d, datetime):
+        return datetime.combine(d, datetime.min.time())
+    return d
+
+def apply_exclude_filter(jobs, filter_config):
+    """Drop the rows of jobs whose filter field contains an excluded string.
+
+    filter_config['field'] names the column to test and
+    filter_config['exclude'] lists the substrings to reject. Entries are
+    matched literally and case-insensitively; rows with a missing value are
+    kept.
+    """
+    exclude_list = filter_config['exclude']
+    if not exclude_list:
+        # An empty pattern matches every string and would drop every row.
+        return jobs
+    # Escape each entry: URLs contain '.', which is a regex wildcard.
+    pattern = '|'.join(re.escape(entry) for entry in exclude_list)
+    excluded = jobs[filter_config['field']].str.contains(pattern, case=False, na=False)
+    return jobs[~excluded]
+
+# NOTE(review): this DAG appears to be modelled on job-bot.py; confirm the two
+# files do not share this dag_id.
+with DAG(
+    'job_bot_api_dag',
+    default_args=default_args,
+    description='A DAG to fetch data from job-bot API and process it',
+    schedule='*/10 * * * *',  # Every 10 minutes
+    start_date=datetime.now() - timedelta(days=1),  # Changed to today-1 day
+    catchup=False,
+    max_active_runs=1,
+    tags=['job-bot', 'api'],
+) as dag:
+
+    @task()
+    def fetch_jobs():
+        """Run every configured search and return the number of jobs kept.
+
+        A search that raises is logged and skipped so that the remaining
+        searches still run.
+        """
+        # Load configuration
+        config = load_search_config(CONFIG_PATH)
+
+        frames = []
+
+        # Process each search configuration
+        for search in config['searches']:
+            try:
+                _jobs = process_jobs(search)
+                # Apply filters from search configuration if they exist
+                if not _jobs.empty and 'filter' in search:
+                    _jobs = apply_exclude_filter(_jobs, search['filter'])
+                if not _jobs.empty:
+                    frames.append(_jobs)
+            except Exception:
+                # Best effort: keep the traceback, move on to the next search
+                logger.exception("Error processing search '%s'", search.get('name'))
+                continue
+
+        # Concatenate once at the end; pd.concat rejects an empty list
+        jobs = pd.concat(frames) if frames else pd.DataFrame()
+
+        # Basic stats
+        logger.info("Found %d jobs", len(jobs))
+        return len(jobs)
+
+    # TaskFlow dependencies
+    api_results = fetch_jobs()
+
+if __name__ == "__main__":
+    dag.test()
diff --git a/job-bot.py b/job-bot.py
index 91001a9..d5981c1 100644
--- a/job-bot.py
+++ b/job-bot.py
@@ -38,6 +38,7 @@ with DAG(
     schedule='*/10 * * * *', # Every 10 minutes
     start_date=datetime.now() - timedelta(days=1), # Changed to today-1 day
     catchup=False,
+    max_active_runs=1,
     tags=['job-bot', 'api'],
 ) as dag:
 
diff --git a/search_criteria.yaml b/search_criteria.yaml
new file mode 100644
index 0000000..c2b020e
--- /dev/null
+++ b/search_criteria.yaml
@@ -0,0 +1,58 @@
+# ├── Indeed limitations:
+# |    Only one from this list can be used in a search:
+# |    - hours_old
+# |    - job_type & is_remote
+# |    - easy_apply
+# │
+# └── LinkedIn limitations:
+# |    Only one from this list can be used in a search:
+# |    - hours_old
+# |    - easy_apply
+
+searches:
+  - name: indeed_engineering_manager
+    params:
+      site_name: ["indeed"]
+      search_term: '"engineering manager"'
+      location: "Mill Valley, CA"
+      distance: 17
+      results_wanted: 10
+      country_indeed: 'USA'
+
+  - name: indeed_engineering_manager_remote
+    params:
+      site_name: ["indeed"]
+      search_term: '"engineering manager"'
+      is_remote: true
+      results_wanted: 10
+      country_indeed: 'USA'
+
+  - name: google_engineering_manager
+    params:
+      site_name: ["google"]
+      google_search_term: 'senior engineering manager'
+      location: "San Francisco, CA"
+      results_wanted: 10
+      hours_old: 24
+    filter:
+      field: "job_url"
+      exclude: ["https://www.linkedin.com", "https://www.glassdoor.com", "https://www.indeed.com"]
+
+  - name: linkedin_engineering_manager
+    params:
+      site_name: ["linkedin"]
+      search_term: 'senior engineering manager'
+      linkedin_fetch_description: true
+      location: "Mill Valley, CA"
+      distance: 17
+      results_wanted: 10
+      hours_old: 24
+
+  - name: linkedin_engineering_manager_remote
+    params:
+      site_name: ["linkedin"]
+      search_term: 'senior engineering manager'
+      linkedin_fetch_description: true
+      is_remote: true
+      results_wanted: 10
+      hours_old: 24