Examples
========

Count the number of even and odd numbers from 0 through 9999::

    >>> from bndl.compute.run import ctx
    >>> (ctx.range(10000)                       # start with some numbers
    ...     .map(lambda i: (i % 2, 1))          # create tuples (odd/even, 1)
    ...     .reduce_by_key(lambda a, b: a + b)  # sum the 1's
    ...     .collect())                         # and collect to the driver
    [(0, 5000), (1, 5000)]

Take a look at the BNDL source files::

    >>> files = ctx.files('*/bndl*', ffilter=re.compile(r'\.pyx?$').search)
    >>> files.count()
    171
    >>> files.map_values(lambda f: len(f.split(b'\n'))).nlargest(3, key=1)
    [('bndl/bndl/compute/dataset.py', 1900),
     ('bndl/bndl/compute/shuffle.py', 889),
     ('bndl/bndl/util/cypickle.pyx', 869)]
    >>> files.lines().map(str.strip).filter().map(len).stats()

Join ORCID records with documents and authorships from Cassandra::

    jsons = ctx.files('./orcid/').decode().values().map(json.loads)
    orcid = ctx.broadcast(jsons.flatmap(orcid_recs).group_by_key().collect())

    docs = ctx.cassandra_table('adg', 'document')
    auths = ctx.cassandra_table('adg', 'authorship')

    auths_by_doi = docs.coscan(auths, keys=['doc_id'] * 2) \
                       .map_keys(lambda doc: doc.doi)

    matches = auths_by_doi.flatmap(select_matches)
    matches.map_partitions(partial(update_adg, ctx)).execute()

Scrape some URLs::

    def scrape_urls(part_idx, urls):
        client = get_client(part_idx)
        for url in urls:
            yield url, client.execute_script(GET_TEXT)

    urls = list(open('adis_urls.txt'))
    ctx.conf['bndl.execute.concurrency'] = 4
    pcount = max(ctx.default_pcount, len(urls) // 100)
    ctx.collection(urls, pcount=pcount) \
       .map_partitions_with_index(scrape_urls) \
       .collect_as_json('./pages/')

Grid search CV::

    from bndl_ml.gridsearch import GridSearchCV

    search = GridSearchCV(ctx, estimator, param_grid, scoring, fit_params,
                          iid, refit, cv, error_score)
    search.fit(X, y)
    search.best_estimator_