Compare commits

...

3 Commits

Author SHA1 Message Date
lolcat bbc5a89779 removed these fucking files AGAIN 2024-03-24 22:47:19 -04:00
lolcat fea09d9580 fag protection 2 2024-03-24 22:36:20 -04:00
lolcat 2976c0a6a4 fag protection 2024-03-24 22:31:19 -04:00
12 changed files with 80 additions and 284 deletions

View File

@ -1,20 +0,0 @@
<?php

/*
	Linear audio endpoint.
	
	Expects the target in the "s" GET parameter and hands it to
	proxy::stream_linear_audio() (lib/curlproxy.php). Failures are reported
	to the client through the X-Error response header.
*/

if(
	!isset($_GET["s"]) ||
	!is_string($_GET["s"]) // rejects array parameters such as ?s[]=x
){
	
	http_response_code(404);
	header("X-Error: No SOUND(s) provided!");
	die();
}

include "data/config.php";
include "lib/curlproxy.php";

$proxy = new proxy();

try{
	
	$proxy->stream_linear_audio($_GET["s"]);
	
}catch(Exception $error){
	
	// Headers can only be modified while nothing has been flushed to the
	// client yet; once streaming has begun there is nothing left to report.
	if(!headers_sent()){
		
		// do not answer "200 OK" with an empty body when the proxy failed
		http_response_code(500);
		
		// header() refuses values containing line breaks, so flatten them
		header(
			"X-Error: " .
			str_replace(["\r", "\n"], " ", $error->getMessage())
		);
	}
}

View File

@ -1,224 +0,0 @@
<?php
// Load data/config.php before anything else runs.
include "data/config.php";
// The sc_audio constructor handles the whole request (it either streams,
// redirects or dies with an error), so the instance itself is not kept.
new sc_audio();
/**
 * Soundcloud audio endpoint.
 *
 * The endpoint works in two modes, selected by the GET parameters:
 *
 *  - "s" (resolve mode): fetches the JSON document behind the given
 *    soundcloud URL, reads the playlist referenced by its "url" key and
 *    redirects the client to this same endpoint with "u" (first segment URL,
 *    positions zeroed out) and "r" (comma separated end positions of every
 *    segment found in the playlist).
 *
 *  - "u" + "r" (proxy mode): picks the segment that covers the offset the
 *    client asked for in its Range header, rewrites the /media/<id>/<a>/<b>
 *    part of "u" accordingly and streams that segment through the proxy.
 *
 * Every code path ends the request on its own (die() or a redirect).
 */
class sc_audio{
	
	/**
	 * Instance of the proxy class from lib/curlproxy.php, used for all
	 * upstream requests. Declared explicitly because creating properties
	 * dynamically is deprecated as of PHP 8.2; kept public so that it stays
	 * as accessible as the previously dynamic property was.
	 */
	public $proxy;
	
	/**
	 * Serves the current request. See the class description for the modes.
	 */
	public function __construct(){
		
		include "lib/curlproxy.php";
		$this->proxy = new proxy();
		
		if(isset($_GET["u"])){
			
			// we're now proxying audio (never returns)
			$this->proxy_audio();
		}
		
		$this->redirect_to_stream();
	}
	
	/**
	 * Proxy mode: streams the segment matching the client's Range header.
	 * Always terminates the script.
	 */
	private function proxy_audio(){
		
		$viewkey = $_GET["u"];
		
		// array parameters (?u[]=x) would otherwise reach preg_replace()
		if(!is_string($viewkey)){
			
			$this->do404("Invalid stream URL(u) specified");
		}
		
		if(
			!isset($_GET["r"]) ||
			!is_string($_GET["r"])
		){
			
			$this->do404("Ranges(r) are missing");
		}
		
		$ranges = $this->parse_ranges($_GET["r"]);
		$last_byte = $ranges[count($ranges) - 1][1];
		$client_range = $this->get_client_range();
		
		if($client_range > $last_byte){
			
			// range is not satisfiable, tell the client the real length
			http_response_code(416);
			header("Content-Type: text/plain");
			header("Content-Range: bytes */" . ($last_byte + 1));
			die();
		}
		
		// The first pair always starts at 0, so it is a safe default. Keep
		// the last pair whose start is not past the requested offset.
		$rng = $ranges[0];
		
		foreach($ranges as $candidate){
			
			if($candidate[0] <= $client_range){
				
				$rng = $candidate;
			}
		}
		
		// proxy data!
		http_response_code(206); // partial content
		header("Accept-Ranges: bytes");
		header("Content-Range: bytes {$rng[0]}-{$rng[1]}/" . ($last_byte + 1));
		
		// NOTE(review): "u" is forwarded to the proxy without a host check
		// in this class; confirm that proxy::stream_linear_audio() restricts
		// the hosts it is willing to contact.
		$viewkey =
			preg_replace(
				'/\/media\/([0-9]+)\/[0-9]+\/[0-9]+/',
				'/media/$1/' . $rng[0] . '/' . $rng[1],
				$viewkey
			);
		
		try{
			
			$this->proxy->stream_linear_audio(
				$viewkey
			);
		}catch(Exception $error){
			
			$this->do404("Could not read stream");
		}
		
		die();
	}
	
	/**
	 * Converts the comma separated end positions from the "r" parameter
	 * into sorted [start, end] pairs. The first pair starts at 0, every
	 * following pair starts one past the previous end.
	 *
	 * Example: "99,199" becomes [[0, 99], [100, 199]].
	 *
	 * Ends the request with a 404 if an entry is not a plain integer.
	 */
	private function parse_ranges($raw){
		
		$ends = [];
		
		// sanitize ranges
		foreach(explode(",", $raw) as $range){
			
			// digits only: no signs, decimals or exponents
			if(preg_match('/^[0-9]+$/', $range) !== 1){
				
				$this->do404("Invalid range specified");
			}
			
			$ends[] = (int)$range;
		}
		
		// sort ranges (just to make sure)
		sort($ends);
		
		// convert ranges to pairs
		$pairs = [];
		$last = -1;
		
		foreach($ends as $end){
			
			$pairs[] = [$last + 1, $end];
			$last = $end;
		}
		
		return $pairs;
	}
	
	/**
	 * Returns the first byte offset requested through the client's Range
	 * header, or 0 when the header is absent or not of the "bytes=N" form.
	 */
	private function get_client_range(){
		
		foreach(getallheaders() as $key => $value){
			
			if(strtolower($key) !== "range"){
				
				continue;
			}
			
			if(
				preg_match(
					'/bytes=([0-9]+)/',
					$value,
					$client_regex
				) === 1
			){
				
				return (int)$client_regex[1];
			}
			
			// only the first Range header is considered
			break;
		}
		
		return 0;
	}
	
	/**
	 * Resolve mode: redirect user to correct resource. The end positions of
	 * all playlist segments are scraped and stored in the result URL.
	 */
	private function redirect_to_stream(){
		
		if(
			!isset($_GET["s"]) ||
			!is_string($_GET["s"])
		){
			
			$this->do404("The URL(s) parameter is missing");
		}
		
		$viewkey = $_GET["s"];
		
		// parse_url() yields null/false when a component cannot be found
		$scheme = parse_url($viewkey, PHP_URL_SCHEME);
		$host = parse_url($viewkey, PHP_URL_HOST);
		
		// preg_match() returns 0 (not false) when nothing matches, so the
		// result is compared against 1. The (^|\.) prefix keeps lookalike
		// hosts such as "evilsoundcloud.com" out.
		if(
			!is_string($scheme) ||
			!in_array(strtolower($scheme), ["http", "https"], true) ||
			!is_string($host) ||
			preg_match(
				'/(^|\.)soundcloud\.com$/i',
				$host
			) !== 1
		){
			
			$this->do404("This endpoint can only be used for soundcloud streams");
		}
		
		try{
			
			$json = $this->proxy->get($viewkey)["body"];
			
		}catch(Exception $error){
			
			$this->do404("Curl error: " . $error->getMessage());
		}
		
		$json = json_decode($json, true);
		
		if(
			!isset($json["url"]) ||
			!is_string($json["url"])
		){
			
			$this->do404("Could not get URL from JSON");
		}
		
		try{
			
			$m3u8 = $this->proxy->get($json["url"])["body"];
			
		}catch(Exception $error){
			
			$this->do404("Curl error: " . $error->getMessage());
		}
		
		$lineout = null;
		$streampos = [];
		
		foreach(explode("\n", $m3u8) as $line){
			
			$line = trim($line);
			
			// skip blank lines and playlist tags/comments
			if(
				$line === "" ||
				$line[0] === "#"
			){
				
				continue;
			}
			
			// the first segment URL is the template for the redirect
			if($lineout === null){
				
				$lineout = $line;
			}
			
			if(
				preg_match(
					'/\/media\/[0-9]+\/([0-9]+)\/([0-9]+)/',
					$line,
					$matches
				) === 1
			){
				
				// only the end position of each segment is passed along
				$streampos[] = (int)$matches[2];
			}
		}
		
		if(
			$lineout === null ||
			count($streampos) === 0
		){
			
			$this->do404("Could not get stream URL");
		}
		
		$lineout =
			preg_replace(
				'/\/media\/([0-9]+)\/[0-9]+\/[0-9]+/',
				'/media/$1/0/0',
				$lineout
			);
		
		$streampos = implode(",", $streampos);
		
		header("Location: audio_sc?u=" . urlencode($lineout) . "&r=$streampos");
		header("Accept-Ranges: bytes");
	}
	
	/**
	 * Ends the request with a 404 and puts $error in the X-Error header.
	 * Never returns.
	 */
	private function do404($error){
		
		// headers cannot be changed anymore once output has started
		if(!headers_sent()){
			
			http_response_code(404);
			header("Content-Type: text/plain");
			header("X-Error: $error");
		}
		
		die();
	}
}

View File

@ -23,17 +23,13 @@ class config{
// Enable the API?
const API_ENABLED = true;
// Bot protection
// 4get.ca has been hit with 500k bot reqs every single day for months
// you probably want to enable this if your instance is public...
// 0 = disabled
// 1 = ask for image captcha (requires imagemagick v6 or higher)
// @TODO: 2 = invite only (users needs a pass)
const BOT_PROTECTION = 0;
//
// BOT PROTECTION
//
// Maximal number of searches per captcha key/pass issued. Counter gets
// reset on every APCU cache clear (should happen once a day)
const MAX_SEARCHES = 100;
// 0 = disabled, 1 = ask for image captcha, @TODO: 2 = invite only (users need a pass)
// VERY useful against a targeted attack
const BOT_PROTECTION = 0;
// if BOT_PROTECTION is set to 1, specify the available datasets here
// images should be named from 1.png to X.png, and be 100x100 in size
@ -45,6 +41,32 @@ class config{
// ["minecraft", 848]
];
// If this regex expression matches on the user agent, it blocks the request
// Not useful at all against a targeted attack
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i';
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
// Useful for blocking *some* proxies used for botting
const FILTERED_HEADER_KEYS = [
"x-forwarded-for",
"x-via",
"forwarded-for",
"via"
];
// @TODO: Portscan the user for open proxies before allowing a connection, block user if any are found
// Requires the nmap package
const NMAP_PROXY_CHECK = false;
// @TODO: Make IP blacklist public under /api/v1/blacklist endpoint ?
const PUBLIC_IP_BLACKLIST = true;
// Maximal number of searches per captcha key/pass issued. Counter gets
// reset on every APCU cache clear (should happen once a day).
// Only useful when BOT_PROTECTION is NOT set to 0
const MAX_SEARCHES = 100;
// List of domains that point to your servers. Include your tor/i2p
// addresses here! Must be a valid URL. Won't affect links placed on
// the homepage.

View File

@ -8,6 +8,9 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
4. The captcha font is located in `data/fonts/captcha.ttf`
## Robots.txt
Make sure you configure this right to optimize your search engine presence! Head over to `/robots.txt` and change the 4get.ca domain to your own domain.
## Server listing
To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it.
@ -32,4 +35,4 @@ If you see spammy entries in your instances list, simply remove the instance fro
Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection!
### Important!
If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead.
If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead. Hopefully this tip can save you 3 hours of your life!

View File

@ -29,7 +29,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "images");
$frontend->drawscrapererror($error->getMessage(), $get, "images", $payload["timetaken"]);
}
if(count($results["image"]) === 0){

View File

@ -44,7 +44,7 @@ class frontend{
$replacements["timetaken"] !== null
){
$replacements["timetaken"] = '<div class="timetaken">Took ' . substr(microtime(true) - $replacements["timetaken"], 0, 4) . 's</div>';
$replacements["timetaken"] = '<div class="timetaken">Took ' . number_format(microtime(true) - $replacements["timetaken"], 2) . 's</div>';
}
$handle = fopen("template/{$template}", "r");
@ -84,29 +84,54 @@ class frontend{
"filters" => $this->generatehtmlfilters($filters, $get)
]);
$headers_raw = getallheaders();
$header_keys = [];
$user_agent = "";
$bad_header = false;
foreach($headers_raw as $headerkey => $headervalue){
$headerkey = strtolower($headerkey);
if($headerkey == "user-agent"){
$user_agent = $headervalue;
continue;
}
// check header key
if(in_array($headerkey, config::FILTERED_HEADER_KEYS)){
$bad_header = true;
break;
}
}
if(
preg_match(
'/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i',
$_SERVER["HTTP_USER_AGENT"]
)
config::HEADER_REGEX,
$user_agent
) ||
$bad_header === true
){
// bot detected !!
apcu_inc("captcha_gen");
$null = null;
$this->drawerror(
"Tshh, blocked!",
'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running <a href="https://git.lolcat.ca/lolcat/4get" rel="noreferrer nofollow">your own 4get instance</a>.',
'Your browser, IP or IP range has been blocked from this 4get instance. If this is an error, please <a href="/about">contact the administrator</a>.',
microtime(true)
);
die();
}
}
public function drawerror($title, $error){
public function drawerror($title, $error, $timetaken){
echo
$this->load("search.html", [
"timetaken" => null,
"timetaken" => $timetaken,
"class" => "",
"right-left" => "",
"right-right" => "",
@ -119,7 +144,7 @@ class frontend{
die();
}
public function drawscrapererror($error, $get, $target){
public function drawscrapererror($error, $get, $target, $timetaken){
$this->drawerror(
"Shit",
@ -131,7 +156,8 @@ class frontend{
'<li>Remove keywords that could cause errors</li>' .
'<li><a href="/instances?target=' . $target . "&" . $this->buildquery($get, false) . '">Try your search on another 4get instance</a></li>' .
'</ul><br>' .
'If the error persists, please <a href="/about">contact the administrator</a>.'
'If the error persists, please <a href="/about">contact the administrator</a>.',
$timetaken
);
}
@ -483,10 +509,6 @@ class frontend{
$archives[] = "warosu.org";
break;
case "cm":
$archives[] = "boards.fireden.net";
break;
case "f":
$archives[] = "archive.4plebs.org";
break;
@ -503,12 +525,10 @@ class frontend{
break;
case "v":
$archives[] = "boards.fireden.net";
$archives[] = "arch.b4k.co";
break;
case "vg":
$archives[] = "boards.fireden.net";
$archives[] = "arch.b4k.co";
break;
@ -579,7 +599,6 @@ class frontend{
break;
case "sci":
$archives[] = "boards.fireden.net";
$archives[] = "warosu.org";
$archives[] = "eientei.xyz";
break;
@ -614,7 +633,6 @@ class frontend{
break;
case "ic":
$archives[] = "boards.fireden.net";
$archives[] = "warosu.org";
break;
@ -741,10 +759,6 @@ class frontend{
$archives[] = "desuarchive.org";
break;
case "y":
$archives[] = "boards.fireden.net";
break;
case "t":
$archives[] = "archiveofsins.com";
break;
@ -802,7 +816,7 @@ class frontend{
$payload .=
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
'<a href="https://archive.is/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .

View File

@ -73,7 +73,7 @@ class fuckhtml{
$attributes = [];
preg_match_all(
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/',
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i',
$starting_tags[2][$i][0],
$regex_attributes
);
@ -88,7 +88,7 @@ class fuckhtml{
continue;
}
$attributes[$regex_attributes[1][$k]] =
$attributes[strtolower($regex_attributes[1][$k])] =
trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00");
}

View File

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "music");
$frontend->drawscrapererror($error->getMessage(), $get, "music", $payload["timetaken"]);
}
$categories = [

View File

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "news");
$frontend->drawscrapererror($error->getMessage(), $get, "news", $payload["timetaken"]);
}
/*

View File

@ -654,6 +654,7 @@ class google{
throw new Exception("Failed to get HTML");
}
//$html = file_get_contents("scraper/google.html");
}

View File

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "videos");
$frontend->drawscrapererror($error->getMessage(), $get, "videos", $payload["timetaken"]);
}
$categories = [

View File

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "web");
$frontend->drawscrapererror($error->getMessage(), $get, "web", $payload["timetaken"]);
}
/*