Futures II
Overview
In this lecture, we reviewed Rust Futures, which we introduced last time, and coded up a revamped version of our Link Explorer using async/await. Below, you can see the original code that ran with a ThreadPool. Below that, we have the code using async/await.
ThreadPool Link Explorer
extern crate reqwest;
extern crate select;
#[macro_use]
extern crate error_chain;
use std::sync::{Arc, Mutex};
use std::{thread};
use select::document::Document;
use select::predicate::Name;
// Declares this program's error types through the `error_chain!` macro; `main` below
// returns the resulting single-parameter `Result<()>`.
// The `foreign_links` section lists `reqwest::Error` and `std::io::Error`, the two
// external error types that `main` propagates with `?`.
error_chain! {
foreign_links {
ReqError(reqwest::Error);
IoError(std::io::Error);
}
}
/// The longest article seen so far; shared between the worker threads behind a mutex.
struct Article {
/// URL the article was fetched from (empty until a page has been recorded).
url: String,
/// Length of the fetched body as reported by `String::len`, i.e. in bytes.
len: usize,
}
/// Number of links fetched concurrently (one thread each) before the batch is joined.
const BATCH_SIZE: usize = 60;
// https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html
fn main() -> Result<()> {
let body = reqwest::blocking::get("https://en.wikipedia.org/wiki/Multithreading_(computer_architecture)")?
.text()?;
// Identify all linked wikipedia pages
let links = Document::from_read(body.as_bytes())?
.find(Name("a"))
.filter_map(|n| {
if let Some(link_str) = n.attr("href") {
if link_str.starts_with("/wiki/") {
Some(format!("{}/{}", "https://en.wikipedia.org",
&link_str[1..]))
} else {
None
}
} else {
None
}
}).collect::<Vec<String>>();
let longest_article = Arc::new(Mutex::new(Article {url: "".to_string(),
len: 0}));
let num_batches = links.len()/BATCH_SIZE;
println!("num_batches: {}", num_batches);
for batch_idx in 0..num_batches {
// println!("link: {}", link);
println!("batch_idx: {}", batch_idx);
let mut reqwesters = Vec::new();
let start = batch_idx * BATCH_SIZE;
let end = std::cmp::min((batch_idx + 1) * BATCH_SIZE, links.len());
for link in &links[start..end] {
let longest_article_clone = longest_article.clone();
let link_clone = link.clone();
reqwesters.push(thread::spawn(move || {
let body = reqwest::blocking::get(&link_clone).expect("").text().expect("");
let curr_len = body.len();
let mut longest_article_ref = longest_article_clone.lock().unwrap();
if curr_len > longest_article_ref.len {
longest_article_ref.len = curr_len;
longest_article_ref.url = link_clone.to_string();
}
}));
}
for handle in reqwesters {
handle.join().expect("Panic occurred in thread!");
}
//println!("page length: {}", curr_len);
}
let longest_article_ref = longest_article.lock().unwrap();
println!("{} was the longest article with length {}", longest_article_ref.url,
longest_article_ref.len);
Ok(())
}
Async Link Explorer
extern crate reqwest;
extern crate select;
#[macro_use]
extern crate error_chain;
use std::sync::{Arc};
use tokio::sync::{Mutex, Semaphore};
use tokio::task;
use select::document::Document;
use select::predicate::Name;
use futures;
// Declares this program's error types through the `error_chain!` macro; `get_body_text`
// and `main` below return the resulting single-parameter `Result<T>`.
// The `foreign_links` section lists `reqwest::Error` and `std::io::Error`, the two
// external error types that are propagated with `?`.
error_chain! {
foreign_links {
ReqError(reqwest::Error);
IoError(std::io::Error);
}
}
/// The longest article seen so far; shared between the spawned tasks behind an async mutex.
struct Article {
/// URL the article was fetched from (empty until a page has been recorded).
url: String,
/// Length of the fetched body as reported by `String::len`, i.e. in bytes.
len: usize,
}
/// Number of permits the connection semaphore in `main` is created with; each call to
/// `get_body_text` acquires one permit before issuing its request.
const BATCH_SIZE: usize = 60;
/// Downloads `link` and returns the response body as text.
///
/// A permit is acquired from `connection_permits` before the request is issued and is
/// held until this function returns, so the number of requests in flight at any moment is
/// bounded by the number of permits the semaphore holds.
///
/// `link` is taken as `&str`; callers holding a `&String` can pass it unchanged thanks to
/// deref coercion.
///
/// # Errors
/// Returns an error if sending the request or reading the response body fails.
async fn get_body_text(link: &str, connection_permits: Arc<Semaphore>) -> Result<String> {
    // Bound to a named variable (not `_`) so the permit stays alive for the rest of the
    // function. Once the permit is dropped, it will increment the semaphore.
    let _permit = connection_permits.acquire().await;

    let body = reqwest::get(link).await?.text().await?;
    Ok(body)
}
// https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html
#[tokio::main(max_threads = 20)]
async fn main() -> Result<()> {
let body = reqwest::get("https://en.wikipedia.org/wiki/Multithreading_(computer_architecture)")
.await?
.text()
.await?;
// Identify all linked wikipedia pages
let links = Document::from_read(body.as_bytes())?
.find(Name("a"))
.filter_map(|n| {
if let Some(link_str) = n.attr("href") {
if link_str.starts_with("/wiki/") {
Some(format!("{}/{}", "https://en.wikipedia.org",
&link_str[1..]))
} else {
None
}
} else {
None
}
}).collect::<Vec<String>>();
// println!("links: {:?}", links);
let longest_article = Arc::new(Mutex::new(Article {url: "".to_string(),
len: 0}));
let connection_permits = Arc::new(Semaphore::new(BATCH_SIZE));
let mut handles = Vec::new();
for link in &links {
let longest_article_clone = longest_article.clone();
let link_clone = link.clone();
let connection_permits_clone = connection_permits.clone();
handles.push(task::spawn(async move {
if let Ok(body) = get_body_text(&link_clone, connection_permits_clone).await {
let curr_len = body.len();
let mut longest_article_ref = longest_article_clone.lock().await;
if curr_len > longest_article_ref.len {
longest_article_ref.len = curr_len;
longest_article_ref.url = link_clone.to_string();
}
}
//println!("{:?}", get_body_text(&link_clone, connection_permits_clone).await);
}));
}
// https://github.com/tokio-rs/tokio/issues/2053
futures::future::join_all(handles).await;
let longest_article_ref = longest_article.lock().await;
println!("{} was the longest article with length {}", longest_article_ref.url,
longest_article_ref.len);
Ok(())
}