My First Dev Job

A write-up on my first valuable contribution as a productive web dev of society!

my-first-dev-job
Update: since writing this post, I have been contracted by HubSpot for multiple other scraping gigs, each involving the collection of similar data from other incubators/accelerators with vastly different requirements and challenges.

I was very fortunate to be hired as an independent contractor for HubSpot in order to scrape some valuable pieces of information about active companies at Y Combinator, the startup accelerator.

Their website is a React app, and they offer no public-facing API. To further complicate things, (1) they only display search results as you scroll down the page, (2) despite how many results fit the options you select, only the first 1000 are shown, and (3) limited information about each company is provided in the "index" page -- you must navigate to the "show" page of a company to get anything useful.

In architecting the algorithm to scrape the details of each company from a search result totaling ~2000 companies, I am most proud of how I overcame the obstacle of the 1000 entry limit.

My solution was to limit the aggregation of "show" endpoints by introducing a second query. The most populated listing of active companies within a particular batch was ~300, so by looping over each batch of active companies, I was able to collect the "show" endpoints of all ~2000 desired company pages on the accelerator's website.

Grouping multiple batches that together totalled fewer than 1000 results would have been more performant than simply going one batch at a time, but I skipped that optimization because I didn't want to bill my customer for unnecessary work.

I used the npm library Puppeteer to be the puppet master of a robot browser. I programmed the script to first click the "See all options" button, as only the first few batch names display on load. Once React hydrated the index page with the HTML I needed to get a list of all the batches available to query, I automated the browser to scroll down each batch listing page until it could not scroll any further. That gave me access to the HTML with the endpoints for each company page in each batch. All that was left to do was automate visiting all ~2000 pages to get the actual details on each company that I needed.

A really cool library called cheerio was used to bring jQuery-style HTML parsing to the server. Check out jsonexport to convert JavaScript objects to CSV, and fs & path to write files.

I'd love for you to watch the abbreviated clip shown below of the browser doing its thing! In the actual execution of the script, many, many more pages were visited. Despite running the Puppeteer instance in headless mode and applying other performance tweaks, the scrape still took about an hour to execute on my machine.



Here is a link to the script, and the code below for your convenience!


const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const jsonexport = require('jsonexport');
const fs = require('fs');
const path = require('path');

/**
 * Entry point: scrapes the name, batch, and website URL of every active
 * Y Combinator company and writes the results to out/<date>.csv.
 *
 * The YC search UI caps any single query at 1000 results, so instead of
 * one big query we loop over every batch (each batch holds well under
 * 1000 active companies), collect each company's "show"-page slug, then
 * visit all ~2000 company pages one at a time.
 */
const seed = async () => {
    const browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox'],
        // headless: false, // DEMO ONLY
        // defaultViewport: null, // DEMO ONLY
    });
    const page = await browser.newPage();

    /**
     * Scrapes one company "show" page.
     * Each field is wrapped in its own try/catch so a missing element
     * degrades to a placeholder instead of losing the whole record.
     * @param {string} companyID - slug of the company's YC page
     * @returns {Promise<{name: string, batch: string, url: string}>}
     */
    const scrapeOneCompany = async (companyID) => {
        const oneCompany = {
            name: '',
            batch: '',
            url: '',
        };

        try {
            await page.goto(
                `https://www.ycombinator.com/companies/${companyID}`,
                {
                    waitUntil: 'networkidle0',
                    timeout: 15000,
                }
            );
        } catch (error) {
            // Page never loaded; return the empty record rather than abort.
            console.log(`goto error: ${companyID}`);
            return oneCompany;
        }

        try {
            oneCompany.name = await page.evaluate(
                () =>
                    document.querySelector(
                        'body > div.content > section.flex-row.company-info > div.main-column > div:nth-child(2) > h1'
                    ).innerHTML
            );
        } catch (error) {
            console.log(`name error: ${companyID}`);
            oneCompany.name = 'null';
        }

        try {
            oneCompany.batch = await page.evaluate(
                () =>
                    document.querySelector(
                        'body > div.content > section.flex-row.company-info > div.main-column > div.flex-row.align-center > div:nth-child(2) > span'
                    ).innerText
            );
        } catch (error) {
            console.log(`batch error: ${companyID}`);
            oneCompany.batch = 'null';
        }

        try {
            oneCompany.url = await page.evaluate(
                () =>
                    document.querySelector(
                        'body > div.content > section.flex-row.company-info > div.main-column > div:nth-child(2) > div > a'
                    ).innerText
            );
        } catch (error) {
            // Fall back to the YC profile URL when no external site is listed.
            console.log(`url error: ${companyID}`);
            oneCompany.url = `https://www.ycombinator.com/companies/${companyID}`;
        }

        return oneCompany;
    };

    /**
     * Visits every company page sequentially. Deliberately serial: the
     * helpers share one Page instance, and going one page at a time keeps
     * request volume polite. Closes the page and browser when done.
     * @param {string[]} companyIDs - company page slugs to visit
     * @returns {Promise<Array<{name: string, batch: string, url: string}>>}
     */
    const scrapeAllCompanies = async (companyIDs) => {
        const allCompanyData = [];
        for (let i = 0; i < companyIDs.length; i++) {
            allCompanyData.push(await scrapeOneCompany(companyIDs[i]));
            console.log(`scrape company #: ${i}`);
        }

        await page.close();
        await browser.close();

        return allCompanyData;
    };

    /**
     * Builds the full list of company slugs: first discovers every batch
     * name from the search sidebar, then scrapes each batch's listing.
     * @returns {Promise<string[]>}
     */
    const scrapeCompanyList = async () => {
        /**
         * Reads the batch names from the search sidebar. The list is
         * collapsed on load, so "See all options" must be clicked first.
         * @returns {Promise<string[]>}
         */
        const scrapeBatchList = async () => {
            //* goto page
            await page.goto(
                'https://www.ycombinator.com/companies?status=Active',
                {
                    waitUntil: 'networkidle0',
                    timeout: 15000,
                }
            );

            //* expand batch list
            await page.click('.styles-module__showMoreLess___11d7N');
            //* short pause for react to re-render the expanded sidebar
            await page.waitForTimeout(100);

            //* isolate the appropriate HTML
            const batchHTML = await page.evaluate(
                () =>
                    document.querySelector(
                        'body > div.content > section > div > div > div.styles-module__leftCol___32BDa > div > div:nth-child(5)'
                    ).innerHTML
            );

            //* convert to cheerio (fragment mode: no html/body wrapper)
            const $ = cheerio.load(batchHTML, null, false);

            //* parse batch IDs into an array; the first two and last
            //* text fragments are sidebar chrome, not batch names
            const batchList = $('span[class=styles-module__label___1Z_B0]')
                .contents()
                .map(function () {
                    return this.type === 'text' ? $(this).text() + ' ' : '';
                })
                .get()
                .join('')
                .split(' ')
                .slice(2, -1);

            return batchList;
        };

        /**
         * Scrapes one batch's listing page and returns the company slugs
         * it links to. Returns [] when the page fails to load.
         * @param {string} batchID - e.g. "S21"
         * @returns {Promise<string[]>}
         */
        const scrapeOneBatch = async (batchID) => {
            try {
                await page.goto(
                    `https://www.ycombinator.com/companies?batch=${batchID}&status=Active`,
                    {
                        waitUntil: 'networkidle0',
                        timeout: 15000,
                    }
                );
            } catch (error) {
                console.log(`scrapeOneBatch error: ${batchID}`);
                return [];
            }

        // Infinite-scroll workaround: results only render as the page
        // scrolls, so scroll to the bottom before reading the HTML.
            const autoScrollAndScrape = async (page) => {
                await page.evaluate(async () => {
                    // new Promise is legitimate here: it adapts the
                    // callback-based setInterval to await.
                    await new Promise((resolve) => {
                        var totalHeight = 0;
                        var distance = 100;
                        var timer = setInterval(() => {
                            var scrollHeight = document.body.scrollHeight;
                            window.scrollBy(0, distance);
                            totalHeight += distance;

                            if (totalHeight >= scrollHeight) {
                                clearInterval(timer);
                                resolve();
                            }
                        }, 100);
                    });
                });

                let singleBatchLinksHTML;
                try {
                    singleBatchLinksHTML = await page.evaluate(
                        () =>
                            document.querySelector(
                                'body > div.content > section > div > div > div.styles-module__rightCol___2NKRr > div.styles-module__section___2yul1.styles-module__results___2lP37'
                            ).innerHTML
                    );
                } catch (error) {
                    console.log(`singleBatchLinksHTML error: ${batchID}`);
                    singleBatchLinksHTML = '';
                }

                return singleBatchLinksHTML;
            };

            const singleBatchLinksHTML = await autoScrollAndScrape(page);

            //* convert to cheerio
            const $ = cheerio.load(singleBatchLinksHTML, null, false);

            const aEls = $('a');

            // Each result link's href ends in the company slug.
            const singleBatchURLS = [];
            $(aEls).each(function (_, link) {
                singleBatchURLS.push($(link).attr('href').split('/').pop());
            });

            console.log(`retrieved links of: ${singleBatchURLS.length}`);
            return singleBatchURLS;
        };

        /**
         * Runs scrapeOneBatch over every batch and concatenates the slugs.
         * @param {string[]} batchList
         * @returns {Promise<string[]>}
         */
        const scrapeAllBatches = async (batchList) => {
            let companyList = [];

            for (let i = 0; i < batchList.length; i++) {
                const singleBatchLinks = await scrapeOneBatch(batchList[i]);
                companyList = companyList.concat(singleBatchLinks);
            }

            return companyList;
        };

        const batchList = await scrapeBatchList();
        return scrapeAllBatches(batchList);
    };

    const date = new Date().toISOString().split('T').shift();
    console.log(`starting scrape on ${date}`);
    const activeCompanyData = await scrapeAllCompanies(
        await scrapeCompanyList()
    );

    jsonexport(activeCompanyData, function (err, csv) {
        if (err) return console.error(err);
        // BUG FIX: fs.writeFileSync takes no callback — the old callback
        // (with its error check and "Saved!" log) was dead code and write
        // errors went unreported. Write synchronously and handle errors
        // explicitly; also make sure the output directory exists.
        const outDir = path.join(__dirname, 'out');
        try {
            fs.mkdirSync(outDir, { recursive: true });
            fs.writeFileSync(path.join(outDir, `${date}.csv`), csv);
            console.log('Saved!');
        } catch (writeErr) {
            console.error(writeErr);
        }
    });
};

// BUG FIX: seed() was a floating promise — any failure became an
// unhandled rejection. Surface errors and exit non-zero.
seed().catch((err) => {
    console.error(err);
    process.exit(1);
});