1
0
Fork 0

first commit

This commit is contained in:
Florian RICHER 2022-06-16 23:13:58 +02:00
commit 1a72eaeaaf
11 changed files with 2187 additions and 0 deletions

1
.env Normal file
View file

@ -0,0 +1 @@
URL=http://vk.gy/database

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

45
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,45 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "lldb",
"request": "launch",
"name": "Debug executable 'data_extractor'",
"cargo": {
"args": [
"build",
"--bin=data_extractor",
"--package=data_extractor"
],
"filter": {
"name": "data_extractor",
"kind": "bin"
}
},
"args": [],
"cwd": "${workspaceFolder}"
},
{
"type": "lldb",
"request": "launch",
"name": "Debug unit tests in executable 'data_extractor'",
"cargo": {
"args": [
"test",
"--no-run",
"--bin=data_extractor",
"--package=data_extractor"
],
"filter": {
"name": "data_extractor",
"kind": "bin"
}
},
"args": [],
"cwd": "${workspaceFolder}"
}
]
}

1557
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

11
Cargo.toml Normal file
View file

@ -0,0 +1,11 @@
[package]
name = "data_extractor"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
reqwest = {version = "0.11", features = ["blocking"]}
scraper = "0.13.0"
dotenvy = "0.15.1"

110
src/extractor/artists.rs Normal file
View file

@ -0,0 +1,110 @@
use scraper::Html;
use super::{Extractor, trim_whitespace, parse_date};
/// One entry of the "artists" listing.
///
/// Every field is stored as plain text exactly as produced by the
/// [`Extractor`] implementation for this type.
#[derive(Debug, Default)]
pub struct Artist {
    /// Text of the entry's first link.
    pub name: String,
    /// `href` of the entry's first link.
    pub url: String,
    /// Leading timestamp taken from the entry's info `div`.
    pub date: String,
    /// Text of the link found inside the info `div`.
    pub author: String,
    /// `href` of the link found inside the info `div`.
    pub author_url: String,
}
impl Extractor for Artist {
    type Output = Self;

    /// Builds one [`Artist`] per `ul>li` element found in `document`.
    ///
    /// Each list item is re-parsed as its own fragment and handed to
    /// [`Extractor::extract`].
    ///
    /// # Panics
    ///
    /// Panics if any list item lacks the elements `extract` requires.
    fn extract_all(document: Html) -> Vec<Self> {
        let item_selector =
            scraper::Selector::parse("ul>li").expect("static selector `ul>li` is valid");
        document
            .select(&item_selector)
            .map(|item| Self::extract(Html::parse_fragment(&item.html())))
            .collect()
    }

    /// Builds a single [`Artist`] from a fragment holding one entry.
    ///
    /// The first `a` supplies `name`/`url`; the first `div` supplies the
    /// date, and the first `a` inside that `div` supplies `author`/`author_url`.
    ///
    /// # Panics
    ///
    /// Panics if the entry link, the info `div`, the author link, or one of
    /// the `href` attributes is missing.
    fn extract(document: Html) -> Self {
        let link_selector = scraper::Selector::parse("a").expect("static selector `a` is valid");
        let info_selector =
            scraper::Selector::parse("div").expect("static selector `div` is valid");

        let a_el = document
            .select(&link_selector)
            .next()
            .expect("artist entry has no link");
        // `text()` yields the decoded text nodes; `inner_html()` would
        // re-serialize them and turn `&` into `&amp;` inside the name.
        let title: String = a_el.text().collect();
        let url = a_el
            .value()
            .attr("href")
            .expect("artist link has no href");

        let more_info_el = document
            .select(&info_selector)
            .next()
            .expect("artist entry has no info div");
        // The raw markup is kept here: `parse_date` only looks at its prefix.
        let date = more_info_el.inner_html();

        let author_el = more_info_el
            .select(&link_selector)
            .next()
            .expect("artist entry has no author link");
        let author: String = author_el.text().collect();
        let author_url = author_el
            .value()
            .attr("href")
            .expect("author link has no href");

        Self {
            name: trim_whitespace(&title),
            url: trim_whitespace(url),
            date: parse_date(&date),
            author: trim_whitespace(&author),
            author_url: trim_whitespace(author_url),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Listing fixture containing two artist entries.
    const EXAMPLES: &str = r#"
<ul>
<li>
<a class="symbol__artist" href="/artists/babykingdom/">BabyKingdom</a>
<div class="h5 any--no-wrap">
2022-06-16 14:12:03 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/suji/">suji</a>
</div>
</li>
<li>
<a class="symbol__artist" href="/artists/hiro-ruvish/">HIRO</a>
<div class="h5 any--no-wrap">
2022-06-16 11:11:06 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/kumika/">kumika</a>
</div>
</li>
</ul>
"#;

    /// Fixture containing the inner markup of a single artist entry.
    const EXAMPLE: &str = r#"
<a class="symbol__artist" href="/artists/babykingdom/">BabyKingdom</a>
<div class="h5 any--no-wrap">
2022-06-16 14:12:03 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/suji/">suji</a>
</div>
"#;

    /// Checks every field of `actual` against the expected values.
    fn assert_artist(
        actual: &Artist,
        name: &str,
        url: &str,
        date: &str,
        author: &str,
        author_url: &str,
    ) {
        assert_eq!(actual.author, author);
        assert_eq!(actual.author_url, author_url);
        assert_eq!(actual.name, name);
        assert_eq!(actual.url, url);
        assert_eq!(actual.date, date);
    }

    #[test]
    fn extract_all_must_return_two_result() {
        let parsed = Artist::extract_all(scraper::Html::parse_fragment(EXAMPLES));

        assert_eq!(parsed.len(), 2);
        assert_artist(
            &parsed[0],
            "BabyKingdom",
            "/artists/babykingdom/",
            "2022-06-16 14:12:03",
            "suji",
            "/users/suji/",
        );
        assert_artist(
            &parsed[1],
            "HIRO",
            "/artists/hiro-ruvish/",
            "2022-06-16 11:11:06",
            "kumika",
            "/users/kumika/",
        );
    }

    #[test]
    fn extract_must_be_return_correct_value() {
        let parsed = Artist::extract(scraper::Html::parse_fragment(EXAMPLE));

        assert_artist(
            &parsed,
            "BabyKingdom",
            "/artists/babykingdom/",
            "2022-06-16 14:12:03",
            "suji",
            "/users/suji/",
        );
    }
}

110
src/extractor/labels.rs Normal file
View file

@ -0,0 +1,110 @@
use scraper::Html;
use super::{Extractor, trim_whitespace, parse_date};
/// One entry of the "labels" listing.
///
/// Every field is stored as plain text exactly as produced by the
/// [`Extractor`] implementation for this type.
#[derive(Debug, Default)]
pub struct Label {
    /// Text of the entry's first link.
    pub name: String,
    /// `href` of the entry's first link.
    pub url: String,
    /// Leading timestamp taken from the entry's info `div`.
    pub date: String,
    /// Text of the link found inside the info `div`.
    pub author: String,
    /// `href` of the link found inside the info `div`.
    pub author_url: String,
}
impl Extractor for Label {
    type Output = Self;

    /// Builds one [`Label`] per `ul>li` element found in `document`.
    ///
    /// Each list item is re-parsed as its own fragment and handed to
    /// [`Extractor::extract`].
    ///
    /// # Panics
    ///
    /// Panics if any list item lacks the elements `extract` requires.
    fn extract_all(document: Html) -> Vec<Self> {
        let item_selector =
            scraper::Selector::parse("ul>li").expect("static selector `ul>li` is valid");
        document
            .select(&item_selector)
            .map(|item| Self::extract(Html::parse_fragment(&item.html())))
            .collect()
    }

    /// Builds a single [`Label`] from a fragment holding one entry.
    ///
    /// The first `a` supplies `name`/`url`; the first `div` supplies the
    /// date, and the first `a` inside that `div` supplies `author`/`author_url`.
    ///
    /// # Panics
    ///
    /// Panics if the entry link, the info `div`, the author link, or one of
    /// the `href` attributes is missing.
    fn extract(document: Html) -> Self {
        let link_selector = scraper::Selector::parse("a").expect("static selector `a` is valid");
        let info_selector =
            scraper::Selector::parse("div").expect("static selector `div` is valid");

        let a_el = document
            .select(&link_selector)
            .next()
            .expect("label entry has no link");
        // `text()` yields the decoded text nodes; `inner_html()` would
        // re-serialize them and turn `&` into `&amp;` inside the name.
        let title: String = a_el.text().collect();
        let url = a_el
            .value()
            .attr("href")
            .expect("label link has no href");

        let more_info_el = document
            .select(&info_selector)
            .next()
            .expect("label entry has no info div");
        // The raw markup is kept here: `parse_date` only looks at its prefix.
        let date = more_info_el.inner_html();

        let author_el = more_info_el
            .select(&link_selector)
            .next()
            .expect("label entry has no author link");
        let author: String = author_el.text().collect();
        let author_url = author_el
            .value()
            .attr("href")
            .expect("author link has no href");

        Self {
            name: trim_whitespace(&title),
            url: trim_whitespace(url),
            date: parse_date(&date),
            author: trim_whitespace(&author),
            author_url: trim_whitespace(author_url),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Listing fixture containing two label entries.
    const EXAMPLES: &str = r#"
<ul>
<li>
<a class="symbol__company" href="/labels/arcanumania-records/">ARCANUMANIA Records.</a>
<div class="h5 any--no-wrap">
2022-06-13 21:27:11 by
<a class="user a--inherit" data-icon="" data-is-vip="0" href="/users/haru/">haru</a>
</div>
</li>
<li>
<a class="symbol__company" href="/labels/omega-code/">Omega Code</a>
<div class="h5 any--no-wrap">
2022-06-12 18:43:41 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/inartistic/">inartistic</a>
</div>
</li>
</ul>
"#;

    /// Fixture containing the inner markup of a single label entry.
    const EXAMPLE: &str = r#"
<a class="symbol__company" href="/labels/arcanumania-records/">ARCANUMANIA Records.</a>
<div class="h5 any--no-wrap">
2022-06-13 21:27:11 by
<a class="user a--inherit" data-icon="" data-is-vip="0" href="/users/haru/">haru</a>
</div>
"#;

    /// Checks every field of `actual` against the expected values.
    fn assert_label(
        actual: &Label,
        name: &str,
        url: &str,
        date: &str,
        author: &str,
        author_url: &str,
    ) {
        assert_eq!(actual.name, name);
        assert_eq!(actual.url, url);
        assert_eq!(actual.date, date);
        assert_eq!(actual.author, author);
        assert_eq!(actual.author_url, author_url);
    }

    #[test]
    fn extract_all_must_return_two_result() {
        let parsed = Label::extract_all(scraper::Html::parse_fragment(EXAMPLES));

        assert_eq!(parsed.len(), 2);
        assert_label(
            &parsed[0],
            "ARCANUMANIA Records.",
            "/labels/arcanumania-records/",
            "2022-06-13 21:27:11",
            "haru",
            "/users/haru/",
        );
        assert_label(
            &parsed[1],
            "Omega Code",
            "/labels/omega-code/",
            "2022-06-12 18:43:41",
            "inartistic",
            "/users/inartistic/",
        );
    }

    #[test]
    fn extract_must_be_return_correct_value() {
        let parsed = Label::extract(scraper::Html::parse_fragment(EXAMPLE));

        assert_label(
            &parsed,
            "ARCANUMANIA Records.",
            "/labels/arcanumania-records/",
            "2022-06-13 21:27:11",
            "haru",
            "/users/haru/",
        );
    }
}

103
src/extractor/mod.rs Normal file
View file

@ -0,0 +1,103 @@
use std::env;
use scraper::Html;
mod artists;
mod labels;
mod musicians;
mod releases;
/// Common interface for turning parsed HTML into typed records.
pub trait Extractor {
    /// Record type produced by the implementor.
    type Output;

    /// Produces every record that can be read from `select`.
    fn extract_all(select: Html) -> Vec<Self::Output>;

    /// Produces a single record from `select`.
    fn extract(select: Html) -> Self::Output;
}
/// CSS selector matching each category block on the fetched page.
const RECENTLY_UPDATED_SELECTOR: &str = "body>div.c2>div";
/// CSS selector matching the title link inside a category block.
const CATEGORY_TITLE_SELECTOR: &str = "h2>a";
/// CSS selector matching the content container inside a category block.
const NEWS_TITLE_SELECTOR: &str = "div.text";
/// Everything collected from one page, grouped by category.
///
/// A category that does not appear on the page keeps its default (empty) list.
#[derive(Debug, Default)]
pub struct Informations {
    /// Entries read from the "artists" category.
    pub artists: Vec<artists::Artist>,
    /// Entries read from the "labels" category.
    pub labels: Vec<labels::Label>,
    /// Entries read from the "musicians" category.
    pub musicians: Vec<musicians::Musician>,
    /// Entries read from the "releases" category.
    pub releases: Vec<releases::Release>,
}
pub fn extract() {
let response = reqwest::blocking::get(env::var("URL").unwrap())
.unwrap()
.text()
.unwrap();
let document = scraper::Html::parse_document(&response);
let div_selector = scraper::Selector::parse(RECENTLY_UPDATED_SELECTOR).unwrap();
let divs = document.select(&div_selector);
let mut categories = Informations::default();
for div in divs {
if let Some(category) = div.select(&scraper::Selector::parse(CATEGORY_TITLE_SELECTOR).unwrap()).next() {
let category_title = category.inner_html();
match category_title.as_str() {
"artists" => {
let content_div = div.select(&scraper::Selector::parse(NEWS_TITLE_SELECTOR).unwrap()).next().unwrap();
categories.artists = artists::Artist::extract_all(scraper::Html::parse_fragment(&content_div.inner_html()));
},
"labels" => {
let content_div = div.select(&scraper::Selector::parse(NEWS_TITLE_SELECTOR).unwrap()).next().unwrap();
categories.labels = labels::Label::extract_all(scraper::Html::parse_fragment(&content_div.inner_html()));
}
"musicians" => {
let content_div = div.select(&scraper::Selector::parse(NEWS_TITLE_SELECTOR).unwrap()).next().unwrap();
categories.musicians = musicians::Musician::extract_all(scraper::Html::parse_fragment(&content_div.inner_html()));
}
"releases" => {
let content_div = div.select(&scraper::Selector::parse(NEWS_TITLE_SELECTOR).unwrap()).next().unwrap();
categories.releases = releases::Release::extract_all(scraper::Html::parse_fragment(&content_div.inner_html()));
}
_ => {}
}
}
}
println!("{:#?}", categories);
}
/// Replaces every tab and newline in `s` with a space, then strips leading
/// and trailing whitespace.
///
/// Interior runs of spaces are left untouched.
pub(self) fn trim_whitespace(s: &str) -> String {
    s.replace(|c: char| matches!(c, '\t' | '\n'), " ")
        .trim()
        .to_owned()
}

/// Returns the leading `YYYY-MM-DD HH:MM:SS` timestamp (19 bytes) of `s`
/// after it has been cleaned with [`trim_whitespace`].
///
/// When the cleaned text is shorter than 19 bytes, or byte 19 does not fall
/// on a character boundary, the whole cleaned text is returned instead of
/// panicking.
pub(self) fn parse_date(s: &str) -> String {
    /// Byte length of a `YYYY-MM-DD HH:MM:SS` timestamp.
    const DATE_LEN: usize = 19;

    let cleaned = trim_whitespace(s);
    // `get` is the checked form of slicing: `None` instead of a panic.
    match cleaned.get(..DATE_LEN) {
        Some(prefix) => prefix.to_owned(),
        None => cleaned,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Text without tabs or newlines passes through unchanged.
    #[test]
    fn test_trim_whitespace() {
        let cleaned = trim_whitespace("Hello World");
        assert_eq!(cleaned, "Hello World");
    }

    /// Tabs become spaces and the outer ones are trimmed away.
    #[test]
    fn test_remove_whitespace_with_tabs() {
        let cleaned = trim_whitespace("\tHello\tWorld\t");
        assert_eq!(cleaned, "Hello World");
    }

    /// Newlines become spaces and the outer ones are trimmed away.
    #[test]
    fn test_remove_whitespace_with_newlines() {
        let cleaned = trim_whitespace("\nHello\nWorld\n");
        assert_eq!(cleaned, "Hello World");
    }

    /// Only the leading timestamp survives; the trailing markup is dropped.
    #[test]
    fn test_parse_date() {
        let raw = "2022-06-16 14:12:03 by <adata-icon=\"\"data-is-vip=\"1\"href=\"/users/suji/\"class=\"usera--inherit\">suji</a>";
        assert_eq!(parse_date(raw), "2022-06-16 14:12:03");
    }
}

110
src/extractor/musicians.rs Normal file
View file

@ -0,0 +1,110 @@
use scraper::Html;
use super::{Extractor, trim_whitespace, parse_date};
/// One entry of the "musicians" listing.
///
/// Every field is stored as plain text exactly as produced by the
/// [`Extractor`] implementation for this type.
#[derive(Debug, Default)]
pub struct Musician {
    /// Text of the entry's first link.
    pub name: String,
    /// `href` of the entry's first link.
    pub url: String,
    /// Leading timestamp taken from the entry's info `div`.
    pub date: String,
    /// Text of the link found inside the info `div`.
    pub author: String,
    /// `href` of the link found inside the info `div`.
    pub author_url: String,
}
impl Extractor for Musician {
    type Output = Self;

    /// Builds one [`Musician`] per `ul>li` element found in `document`.
    ///
    /// Each list item is re-parsed as its own fragment and handed to
    /// [`Extractor::extract`].
    ///
    /// # Panics
    ///
    /// Panics if any list item lacks the elements `extract` requires.
    fn extract_all(document: Html) -> Vec<Self> {
        let item_selector =
            scraper::Selector::parse("ul>li").expect("static selector `ul>li` is valid");
        document
            .select(&item_selector)
            .map(|item| Self::extract(Html::parse_fragment(&item.html())))
            .collect()
    }

    /// Builds a single [`Musician`] from a fragment holding one entry.
    ///
    /// The first `a` supplies `name`/`url`; the first `div` supplies the
    /// date, and the first `a` inside that `div` supplies `author`/`author_url`.
    ///
    /// # Panics
    ///
    /// Panics if the entry link, the info `div`, the author link, or one of
    /// the `href` attributes is missing.
    fn extract(document: Html) -> Self {
        let link_selector = scraper::Selector::parse("a").expect("static selector `a` is valid");
        let info_selector =
            scraper::Selector::parse("div").expect("static selector `div` is valid");

        let a_el = document
            .select(&link_selector)
            .next()
            .expect("musician entry has no link");
        // `text()` yields the decoded text nodes; `inner_html()` would
        // re-serialize them and turn `&` into `&amp;` inside the name.
        let title: String = a_el.text().collect();
        let url = a_el
            .value()
            .attr("href")
            .expect("musician link has no href");

        let more_info_el = document
            .select(&info_selector)
            .next()
            .expect("musician entry has no info div");
        // The raw markup is kept here: `parse_date` only looks at its prefix.
        let date = more_info_el.inner_html();

        let author_el = more_info_el
            .select(&link_selector)
            .next()
            .expect("musician entry has no author link");
        let author: String = author_el.text().collect();
        let author_url = author_el
            .value()
            .attr("href")
            .expect("author link has no href");

        Self {
            name: trim_whitespace(&title),
            url: trim_whitespace(url),
            date: parse_date(&date),
            author: trim_whitespace(&author),
            author_url: trim_whitespace(author_url),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Listing fixture containing two musician entries.
    const EXAMPLES: &str = r#"
<ul>
<li>
<a class="symbol__musician" href="/musicians/27075/hiko/">HIKO</a>
<div class="h5 any--no-wrap">
2022-06-16 14:12:02 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/suji/">suji</a>
</div>
</li>
<li>
<a class="symbol__musician" href="/musicians/6312/hiro/">HIRO</a>
<div class="h5 any--no-wrap">
2022-06-16 11:11:25 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/kumika/">kumika</a>
</div>
</li>
</ul>
"#;

    /// Fixture containing the inner markup of a single musician entry.
    const EXAMPLE: &str = r#"
<a class="symbol__musician" href="/musicians/6312/hiro/">HIRO</a>
<div class="h5 any--no-wrap">
2022-06-16 11:11:25 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/kumika/">kumika</a>
</div>
"#;

    /// Checks every field of `actual` against the expected values.
    fn assert_musician(
        actual: &Musician,
        name: &str,
        url: &str,
        date: &str,
        author: &str,
        author_url: &str,
    ) {
        assert_eq!(actual.name, name);
        assert_eq!(actual.url, url);
        assert_eq!(actual.date, date);
        assert_eq!(actual.author, author);
        assert_eq!(actual.author_url, author_url);
    }

    #[test]
    fn extract_all_must_return_two_result() {
        let parsed = Musician::extract_all(scraper::Html::parse_fragment(EXAMPLES));

        assert_eq!(parsed.len(), 2);
        assert_musician(
            &parsed[0],
            "HIKO",
            "/musicians/27075/hiko/",
            "2022-06-16 14:12:02",
            "suji",
            "/users/suji/",
        );
        assert_musician(
            &parsed[1],
            "HIRO",
            "/musicians/6312/hiro/",
            "2022-06-16 11:11:25",
            "kumika",
            "/users/kumika/",
        );
    }

    #[test]
    fn extract_must_be_return_correct_value() {
        let parsed = Musician::extract(scraper::Html::parse_fragment(EXAMPLE));

        assert_musician(
            &parsed,
            "HIRO",
            "/musicians/6312/hiro/",
            "2022-06-16 11:11:25",
            "kumika",
            "/users/kumika/",
        );
    }
}

132
src/extractor/releases.rs Normal file
View file

@ -0,0 +1,132 @@
use scraper::Html;
use super::{Extractor, trim_whitespace, parse_date};
/// One entry of the "releases" listing.
///
/// Every field is stored as plain text exactly as produced by the
/// [`Extractor`] implementation for this type.
#[derive(Debug, Default)]
pub struct Release {
    /// Text of the artist link (`div.any--weaken-color>a.artist`).
    pub name: String,
    /// `href` of the artist link.
    pub url: String,
    /// Text of the release link (`a.symbol__release`).
    pub album: String,
    /// `href` of the release link.
    pub album_url: String,
    /// Leading timestamp taken from the entry's `div.h5`.
    pub date: String,
    /// Text of the link found inside `div.h5`.
    pub author: String,
    /// `href` of the link found inside `div.h5`.
    pub author_url: String,
}
impl Extractor for Release {
    type Output = Self;

    /// Builds one [`Release`] per `ul>li` element found in `document`.
    ///
    /// Each list item is re-parsed as its own fragment and handed to
    /// [`Extractor::extract`].
    ///
    /// # Panics
    ///
    /// Panics if any list item lacks the elements `extract` requires.
    fn extract_all(document: Html) -> Vec<Self> {
        let item_selector =
            scraper::Selector::parse("ul>li").expect("static selector `ul>li` is valid");
        document
            .select(&item_selector)
            .map(|item| Self::extract(Html::parse_fragment(&item.html())))
            .collect()
    }

    /// Builds a single [`Release`] from a fragment holding one entry.
    ///
    /// `div.any--weaken-color>a.artist` supplies `name`/`url`,
    /// `a.symbol__release` supplies `album`/`album_url`, `div.h5` supplies
    /// the date, and the first `a` inside it supplies `author`/`author_url`.
    ///
    /// # Panics
    ///
    /// Panics if any of those elements or one of the `href` attributes is
    /// missing.
    fn extract(document: Html) -> Self {
        let artist_selector = scraper::Selector::parse("div.any--weaken-color>a.artist")
            .expect("static artist selector is valid");
        let album_selector = scraper::Selector::parse("a.symbol__release")
            .expect("static release selector is valid");
        let info_selector =
            scraper::Selector::parse("div.h5").expect("static selector `div.h5` is valid");
        let link_selector = scraper::Selector::parse("a").expect("static selector `a` is valid");

        let a_el = document
            .select(&artist_selector)
            .next()
            .expect("release entry has no artist link");
        // `text()` yields the decoded text nodes; `inner_html()` would
        // re-serialize them and turn `&` into `&amp;` inside the name.
        let title: String = a_el.text().collect();
        let url = a_el
            .value()
            .attr("href")
            .expect("artist link has no href");

        let a_album_el = document
            .select(&album_selector)
            .next()
            .expect("release entry has no release link");
        let album: String = a_album_el.text().collect();
        let album_url = a_album_el
            .value()
            .attr("href")
            .expect("release link has no href");

        let more_info_el = document
            .select(&info_selector)
            .next()
            .expect("release entry has no info div");
        // The raw markup is kept here: `parse_date` only looks at its prefix.
        let date = more_info_el.inner_html();

        let author_el = more_info_el
            .select(&link_selector)
            .next()
            .expect("release entry has no author link");
        let author: String = author_el.text().collect();
        let author_url = author_el
            .value()
            .attr("href")
            .expect("author link has no href");

        Self {
            name: trim_whitespace(&title),
            url: trim_whitespace(url),
            album: trim_whitespace(&album),
            album_url: trim_whitespace(album_url),
            date: parse_date(&date),
            author: trim_whitespace(&author),
            author_url: trim_whitespace(author_url),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Listing fixture containing two release entries.
    const EXAMPLES: &str = r#"
<ul>
<li>
<div class="any--weaken-color">
<a class="artist" href="/artists/mucc/">MUCC</a>
</div>
<a class="symbol__release" href="/releases/mucc/54429/shin-sekai-tsuujouban/">Shin Sekai Tsuujouban</a>
<div class="h5 any--no-wrap">
2022-06-16 12:21:00 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/kumika/">kumika</a>
</div>
</li>
<li>
<div class="any--weaken-color">
<a class="artist" href="/artists/lay-about-world/">LAY ABOUT WORLD</a>
</div>
<a class="symbol__release" href="/releases/lay-about-world/37128/c-lone/">c×lone </a>
<div class="h5 any--no-wrap">
2022-06-16 11:39:52 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/kumika/">kumika</a>
</div>
</li>
</ul>
"#;

    /// Fixture containing the inner markup of a single release entry.
    const EXAMPLE: &str = r#"
<div class="any--weaken-color">
<a class="artist" href="/artists/mucc/">MUCC</a>
</div>
<a class="symbol__release" href="/releases/mucc/54429/shin-sekai-tsuujouban/">Shin Sekai Tsuujouban</a>
<div class="h5 any--no-wrap">
2022-06-16 12:21:00 by
<a class="user a--inherit" data-icon="" data-is-vip="1" href="/users/kumika/">kumika</a>
</div>
"#;

    /// Checks every field of `actual` against `expected`, given in the order
    /// name, url, album, album_url, date, author, author_url.
    fn assert_release(actual: &Release, expected: [&str; 7]) {
        let [name, url, album, album_url, date, author, author_url] = expected;
        assert_eq!(actual.name, name);
        assert_eq!(actual.url, url);
        assert_eq!(actual.album, album);
        assert_eq!(actual.album_url, album_url);
        assert_eq!(actual.date, date);
        assert_eq!(actual.author, author);
        assert_eq!(actual.author_url, author_url);
    }

    #[test]
    fn extract_all_must_return_two_result() {
        let parsed = Release::extract_all(scraper::Html::parse_fragment(EXAMPLES));

        assert_eq!(parsed.len(), 2);
        assert_release(
            &parsed[0],
            [
                "MUCC",
                "/artists/mucc/",
                "Shin Sekai Tsuujouban",
                "/releases/mucc/54429/shin-sekai-tsuujouban/",
                "2022-06-16 12:21:00",
                "kumika",
                "/users/kumika/",
            ],
        );
        assert_release(
            &parsed[1],
            [
                "LAY ABOUT WORLD",
                "/artists/lay-about-world/",
                "c×lone",
                "/releases/lay-about-world/37128/c-lone/",
                "2022-06-16 11:39:52",
                "kumika",
                "/users/kumika/",
            ],
        );
    }

    #[test]
    fn extract_must_be_return_correct_value() {
        let parsed = Release::extract(scraper::Html::parse_fragment(EXAMPLE));

        assert_release(
            &parsed,
            [
                "MUCC",
                "/artists/mucc/",
                "Shin Sekai Tsuujouban",
                "/releases/mucc/54429/shin-sekai-tsuujouban/",
                "2022-06-16 12:21:00",
                "kumika",
                "/users/kumika/",
            ],
        );
    }
}

7
src/main.rs Normal file
View file

@ -0,0 +1,7 @@
mod extractor;
/// Loads environment variables from a `.env` file when one exists, then runs
/// the extractor.
///
/// # Panics
///
/// Panics if a `.env` file exists but cannot be loaded, and on any failure
/// inside `extractor::extract`.
fn main() {
    // A missing `.env` file is not an error: the variables may already be
    // exported in the process environment. Any other load failure (for
    // example a malformed line) is still reported.
    if let Err(err) = dotenvy::dotenv() {
        if !err.not_found() {
            panic!("failed to load the .env file: {}", err);
        }
    }
    extractor::extract();
}