From 42d555663043785effb763a70f831ccfef7a6c5b Mon Sep 17 00:00:00 2001 From: stacksmith Date: Fri, 4 Mar 2022 17:37:21 -0500 Subject: [PATCH] initial --- Makefile | 51 ++++++++ README.md | 36 ++++++ comm.c | 66 ++++++++++ db.c | 155 ++++++++++++++++++++++++ db.h | 6 + gemtext.c | 165 +++++++++++++++++++++++++ global.h | 14 +++ log.c | 208 ++++++++++++++++++++++++++++++++ log.h | 5 + lookup.c | 33 +++++ main.c | 87 ++++++++++++++ sigil.c | 99 +++++++++++++++ sigil.h | 5 + url.c | 350 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ url.h | 27 +++++ 15 files changed, 1307 insertions(+) create mode 100644 Makefile create mode 100644 README.md create mode 100644 comm.c create mode 100644 db.c create mode 100644 db.h create mode 100644 gemtext.c create mode 100644 global.h create mode 100644 log.c create mode 100644 log.h create mode 100644 lookup.c create mode 100644 main.c create mode 100644 sigil.c create mode 100644 sigil.h create mode 100644 url.c create mode 100644 url.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d61c03b --- /dev/null +++ b/Makefile @@ -0,0 +1,51 @@ +# http://creativecommons.org/publicdomain/zero/1.0/ +bin := ~/bin +SPADIR := $(HOME)/.spa +logfile := '"$(SPADIR)/spa.log"' +map20file := '"$(SPADIR)/spa.map20"' + +all: link + +gemtext.o: gemtext.c global.h + gcc -Wall -c gemtext.c + +log.o: log.c global.h + gcc -Wall -c -DDBFILE_LOG=$(logfile) -DDBFILE_MAP=$(map20file) log.c + +sigil.o: sigil.c global.h + gcc -Wall -c sigil.c + +db.o: db.c global.h log.h + gcc -c -Wall -DDBFILE_LOG=$(logfile) -DDBFILE_MAP=$(map20file) db.c + +comm.o: comm.c global.h + gcc -Wall -c comm.c + +url.o: url.c global.h + gcc -Wall -c url.c + +main.o: main.c global.h + gcc -Wall -c main.c + +link: main.o db.o comm.o url.o sigil.o gemtext.o log.o + gcc -Wall -o spa main.o db.o comm.o url.o sigil.o gemtext.o log.o + strip spa + mv spa $(bin)/ + +lookup.o: lookup.c db.h global.h sigil.h + gcc -Wall -c lookup.c + +lookup: lookup.o db.o log.o 
sigil.o + gcc -Wall -o lookup lookup.o db.o log.o sigil.o + mv lookup $(bin)/ + + +clean: + rm -f *~ *# *.o spa + rm $(bin)/spa + +wipe: + rm -f $(SPADIR)/spa.log + rm -f $(SPADIR)/spa.map20 + truncate -s 1K $(SPADIR)/spa.log + truncate -s 4M $(SPADIR)/spa.map20 diff --git a/README.md b/README.md new file mode 100644 index 0000000..2ab8b93 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# spa - a Spartan Client powered by HorseWare + +WIP! + +`spa` is an experimental terminal client for the [Spartan Protocol](https://portal.mozz.us/spartan/spartan.mozz.us/), a minimized, unencrypted version of the Gemini protocol. + +`spa` is built as a proof of concept and a testbed for a slightly off-the-beaten-path technology inspired by Whinam methodologies. + +On the surface it is just a simplistic browser. Internally, an immutable logbase tracks every spartan URL ever encountered, assigning 4-character sigils for display and access. Sigils may be used instead of the URLs they represent, and are much easier to memorize, type in, or otherwise keep track of. The database acts as a URL-shortener, a bookmark system, and a history log. + +## Usage + +`spa URL` to visit a spartan URL +`spa SIGIL` to visit a sigil + +Currently, the output is sent to the terminal, after generating appropriate sigils. Further navigation using sigils is now possible by re-invoking spa. + +### Sigils +Sigils are 4-character mnemonics assigned for each URL visited or rendered by `spa`. Each character may be an uppercase alpha character (but not 'O' or 'I') or a digit (but not '0' or '1'). This allows for 32 possibilities representing 5 bits; 4 such characters provide for 20 bits, or 1M (1048576) possible URLs (sufficient for Spartan!). + + +## Requirements +A Linux machine with a GCC compiler. + +## Resources +The compiled binary is < 20K. +Upon installation, a spa.log file is created in the `.spa` directory. It will contain every spartan URL seen by the system. Each URL is stored once and only once. 
/* Scratch buffers used by get(): the outgoing request line and the receive
   window that is copied to the output file. */
char req[1000];
char reply[0x10000];

/*******************************************************************************

hostname_to_ip - resolve a host name with gethostbyname() and return its first
                 address.

The result is the raw in_addr word (the form get() stores into
sin_addr.s_addr), widened to long.

Return: the address; 0 when the lookup fails (herror is called) or when the
        answer is not a 4-byte address.

 ******************************************************************************/
long hostname_to_ip(char * hostname){
  struct hostent *he = gethostbyname(hostname);
  if (he == NULL) {
    herror("gethostbyname");
    return 0;
  }
  if (he->h_length != (int)sizeof(struct in_addr) || he->h_addr_list[0] == NULL)
    return 0;

  // Copy exactly the address bytes. Dereferencing the entry through a long*
  // read sizeof(long) bytes, i.e. past the 4-byte address where long is 8 bytes.
  struct in_addr addr;
  memcpy(&addr, he->h_addr_list[0], sizeof addr);
  return (long)addr.s_addr;
}


struct sockaddr_in server;

/*******************************************************************************

get - fetch one resource: connect to host, send the request line
      "<host> <path> 0\r\n" and copy everything the server sends into f.

  f     open stream that receives the raw reply
  host  host name to resolve and connect to
  port  TCP port; 0 selects the default, 300
  path  path placed in the request line

Return:  0 success
        -1 socket() failed
        -2 host could not be resolved
        -3 connect() failed
        -4 send() failed
        -5 request line does not fit into req[]
        -6 recv() failed

The socket is closed on every path once it has been created.

 ******************************************************************************/
int get(FILE* f,char* host, U32 port,char* path){
  int rc = 0;
  int want;
  size_t left;
  const char *cursor;
  ssize_t len;

  int sock = socket(AF_INET , SOCK_STREAM , 0);
  if (sock == -1){
    printf("Could not create socket\n");
    return -1;
  }

  if(!(server.sin_addr.s_addr = hostname_to_ip(host))){
    rc = -2;
    goto done;
  }
  server.sin_family = AF_INET;
  server.sin_port = htons(port?port:300);

  if (connect(sock , (struct sockaddr*)&server , sizeof(struct sockaddr_in)) < 0) {
    perror("Could not connect\n");
    rc = -3;
    goto done;
  }

  // Build the request line; refuse rather than overrun req[] on a long path.
  want = snprintf(req,sizeof req,"%s %s 0\r\n",host, path);
  if (want < 0 || (size_t)want >= sizeof req) {
    puts("Request too long");
    rc = -5;
    goto done;
  }

  // send() may accept fewer bytes than asked for; keep going until all are out.
  cursor = req;
  left = (size_t)want;
  while (left > 0) {
    ssize_t sent = send(sock , cursor , left , 0);
    if (sent < 0) {
      puts("Send failed");
      rc = -4;
      goto done;
    }
    cursor += sent;
    left -= (size_t)sent;
  }

  //Receive the reply until the server closes the connection
  while( (len = recv(sock, reply, sizeof(reply), 0)) > 0){
    fwrite(reply,1,(size_t)len,f);
  }
  if (len < 0) {
    perror("recv");
    rc = -6;
  }

done:
  // NOTE(review): close() needs <unistd.h>; confirm it is among this file's
  // includes.
  close(sock);
  return rc;
}
+ +To enforce this policy, the application keeps a variable last_stamp, a 64-bit +value containing: +| 0 - 31 | last-stamp-value | +| 32- 63 | last stamp file position | + + ******************************************************************************/ +#ifdef ERRLOG + extern FILE* errlog; +#endif + + + +tIDX_ENTRY* map; +/******************************************************************************* + + + + ******************************************************************************/ + +/******************************************************************************* + + + + ******************************************************************************/ +int sys_open(){ + log_open(); + int fd = open(DBFILE_MAP,O_RDWR); + if(fd == -1) { + printf("unable to open map file %s\n",DBFILE_MAP); + return -1; + } + // map to memory + map = (tIDX_ENTRY*) mmap(NULL,IDX_SIZE,PROT_READ|PROT_WRITE,MAP_SHARED,fd,0); + close(fd); + + if(MAP_FAILED==map){ + printf("mmap failed\n"); + return -2; + } else { + return 0; + } +} +/******************************************************************************* + + + + ******************************************************************************/ +void sys_close(){ + log_close(); + munmap(map,IDX_SIZE); +} +/******************************************************************************* + + Given a URL, return an idx of the interned URL. + + ******************************************************************************/ +U32 URL_idx(char* str){ + U32 idx = string_hash(str); + + // Linear-probe + char buf[256]; + U32 off; + while((off=map[idx])){ // as long as a map exists for hash, + log_read(off,buf); // look at what it refers to, and + if(!strcmp(str,buf)) // see if it is our needle. If so, + return idx; // we already have it, return it. + idx = (idx+1) & IDX_MASK; // otherwise, linear-probe next. + } + // not found? create. 
idx points at a 0 + U32 pos = log_write(str); + map[idx]= pos; + return idx; +} +/******************************************************************************* + + + + ******************************************************************************/ +U32 idx_URL(U32 idx,char* buf){ + U32 offset = map[idx]; + //printf("db.c: offset:%X",offset); + + if(offset>0) + return log_read(offset,buf); + else + return 0; +} + diff --git a/db.h b/db.h new file mode 100644 index 0000000..32c2f6f --- /dev/null +++ b/db.h @@ -0,0 +1,6 @@ +int sys_open(); +void sys_close(); +void idx_sigil(U32 idx,char*sigil); +U32 sigil_idx(char* sigil); +U32 URL_idx(char* str); +U32 idx_URL(U32 idx,char* buf); diff --git a/gemtext.c b/gemtext.c new file mode 100644 index 0000000..521e9d2 --- /dev/null +++ b/gemtext.c @@ -0,0 +1,165 @@ +#include +#include +#include "global.h" +#include "db.h" +#include "url.h" +#include //exit + +extern sUrl base; //the url being rendered + +char* skip_to_ws(char* p){ + char c; + while((c=*p)) + if((' '==c)||('\t'==c)||('\n'==c)) + break; + else + p++; + return p; +} + +char* skip_ws(char*p){ + char c; + while((c=*p)) + if((' '==c)||('\t'==c)||('\n'==c)) + p++; + else + break; + return p; +} + +U32 url_length(char* url){ + char* end = skip_to_ws(url); + return end-url; +} + +char linebuf[4096]; + +/******************************************************************************* + +render_link_line - Pull out the URL, normalize, and check with the db for the + sigil. Format line for display. 
+ + ******************************************************************************/ +void render_link_line(FILE*out,char*p){ + // printf("[%s]\n",p); + char* u = skip_ws(p+2); + U32 ulen = url_length(u); + + sUrl url; + + + if(url_is_full(u)){ + sUrl_set(&url,u,ulen); + sUrl_full(&url); + } else { + sUrl_copy(&url,&base); + sUrl_file(&url); + // printf("[r[%*.*s]r]",url_len,url_len,url_start); + //printf("[b[%s]b]",base.buf); + sUrl_rel(&url,u,ulen); + // printf("[[%s]]",url.buf); + } + // make a shortcut! + // printf("[[[%s]]]", url.buf); + if(!sUrl_norm(&url)){ + // URL normalized OK. Only spartan urls are databased. + if(!(strncmp("spartan://",url.buf,10))) { + char sigil[5]; + U32 idx = URL_idx(url.buf); + idx_sigil(idx,sigil); + //fprintf(out,"\033[33m%s\033[0m ",sigil); + fprintf(out,"\033[33m%s\033[0m ",sigil); + p = skip_ws(u+ulen); + if(*p) + fprintf(out,"\033[92m%s\033[0m",p); + else { + fprintf(out,"\033[92m%*.*s\033[0m\n",ulen,ulen,u); + } + } else { //non-spartan urls are printed as-is + fprintf(out,"\033[95m???? 
%s\033[0m",u); + } + } else { //an attempt to manipulate a path + fprintf(out,"\033[91mXXXX %s\033[0m",u); + } + + +} +/******************************************************************************* + +render_head - Render a header line, # ## or ### + + ******************************************************************************/ +void render_head(FILE* out, char*p){ + int level; + for(level=0;*p=='#';p++,level++); + switch(level){ + case 1: fprintf(out,"\033[93m");break; + case 2: fprintf(out,"\033[94;1m");break; + case 3: fprintf(out,"\033[94m");break; + } + fprintf(out,"%s\033[0m",p); +} +/******************************************************************************* + +render_quote - Render a > quote line + + ******************************************************************************/ +void render_quote(FILE* out,char*p){ + fprintf(out,"\033[36m%s\033[0m",p); +} +/******************************************************************************* +render_line - dispatch to a proper line handler. + +*******************************************************************************/ +/* +Return 0 for normal, 1 for block-quote status. 
+*/ +int render_line(FILE* out,char* p, int blockquote){ + int backquotes = !strncmp("```",p,3); + if(blockquote){ + if(backquotes) + return 0; + else{ + fputs(linebuf,out); + return 1; + } + } else { + if(backquotes) + return 1; + else { + switch(*p){ + case '=': + if('>'==*(p+1)){ + render_link_line(out,p); + return 0; + } + case '#': + render_head(out,p); + return 0; + case '>': + render_quote(out,p); + return 0; + } + fprintf(out,"%s",linebuf); + return 0; + } + } +} + +void render_file1(FILE* in,FILE* out){ + int bq = 0; + while(fgets(linebuf,4096,in)) + bq=render_line(out,linebuf,bq ); +} +/******************************************************************************* + +render_file - Render the entire in file to out file + + ******************************************************************************/ + +void render_file(FILE* in,FILE* out){ + int bq = 0; // tracking multiline-quote state + while(fgets(linebuf,4096,in)) + bq=render_line(out,linebuf,bq ); +} + diff --git a/global.h b/global.h new file mode 100644 index 0000000..1bc0b0e --- /dev/null +++ b/global.h @@ -0,0 +1,14 @@ +#include + +typedef uint8_t U8; +typedef uint16_t U16; +typedef uint32_t U32; +typedef uint64_t U64; + + +/* Application constants */ +//#define DBFILE_LOG "/home/stack/.spa/spa.log" +//#define DBFILE_MAP "/home/stack/.spa/spa.map20" +// tempfiles are dumped into working directory +#define TEMPFILE_GMI "tempfile.gmi" +#define TEMPFILE_OUT "tempfile.out" diff --git a/log.c b/log.c new file mode 100644 index 0000000..01efd3f --- /dev/null +++ b/log.c @@ -0,0 +1,208 @@ +#include + + + +#include +#include +#include +#include +#include + +#include +#include + +#include "global.h" +#include "sigil.h" +/* + +The database consists of two files; +* An immutable log with timestamped entries of variable size (URLs); +* An index mapping 20-bit values to corresponding log positions + + + Terminology: + * LOG * - an append-only log containing: + - :<0> + - /<0> + + * MAP * - a memory-mapped 
array of 1M 8-byte entries, used as a hashtable + + * IDX * - a 20-bit index into MAP + + * OFFSET * - an offset into LOG (for URLs); + + * NICK * - a 4-character mnemonic string convertible to an IDX + +An entry stored in MAP contains an offset to a URL, and some metadata. The +index of an entry is hashed from the URL (linear probing used on collisions). +A NICK may be used to communicate the IDX. + */ + +#define IDX_ENTRY_SIZE 4 +#define tIDX_ENTRY U32 +// Index size is 20 bits... +#define IDX_SIZE (0x100000 * IDX_ENTRY_SIZE) +#define IDX_MASK 0xFFFFF +// Each entry contains an offset into log +#define OFF_MASK 0x7FFFFFFF + +/******************************************************************************* + +LOG + +Current log state is kept here. + +TIMESTAMPS +========== + +<:><0> + +The log contains timestamp records interleaved with other records. As a matter +of policy, not every record is preceded with a stamp. + +Currently, the policy is: +* Any 1K run of log data must contain at least one stamp; +* If the previous stamp is more than 59 seconds in the past, a stamp now. 
+ +To enforce this policy, the application keeps a variable last_stamp, a 64-bit +value containing: +| 0 - 31 | last-stamp-value | +| 32- 63 | last stamp file position | + + ******************************************************************************/ +#ifdef ERRLOG + extern FILE* errlog; +#endif + +FILE* flog; +U64 last_stamp = 0; + +#define LS_STAMP ((U32)(last_stamp & 0xFFFFFFFF)) +#define LS_POS ((U32)(last_stamp>>32)) +#define STAMP_SIZE 6 + +/******************************************************************************* + +stamp_write Given now, a U32 current time in seconds, and here, the current + position in the file + + ******************************************************************************/ +U32 stamp_write(U32 now,U32 here){ + last_stamp = ((U64)here)<<32 | now; + putc(':',flog); + fwrite(&now,4,1,flog); + putc(0,flog); + return 6; +} +/******************************************************************************* + +stamp_maybe_write Consult the stamp policy and write a stamp when needed + + + ******************************************************************************/ +U32 stamp_maybe_write(U32 here){ + U32 now=time(NULL); + + if( ((now - LS_STAMP) > 59) || // If a minute or more has passed since stamp, + ((here - LS_POS)>(1024-STAMP_SIZE))) + { + //printf("STAMPING\n"); + return stamp_write(now,here); + } else + return 0; +} + +/******************************************************************************* + + + + ******************************************************************************/ +/* log_stamp_of(U32 pos) + +Return stamp (low U32) and position thereof (high U32) + + */ +U64 log_stamp_of(U32 pos){ + char buf[1024]; + FILE* f = fopen(DBFILE_LOG,"r"); + fseek(f,pos-1024,SEEK_SET); // read 1K before pos; + fread(buf,1024,1,f); // stamp guaranteed + fclose(f); + for(int i=1024-6;i>=0;i--){ // scan from end, backward + if((buf[i]==0)&&(buf[i+1]==':')){ // search for '<0>:' + printf("stamp detected at %08x\n",pos-1024+i+1); + 
return (((U64)(pos-1024+i+2))<<32) | (*(U32*)(buf+i+2)); + } + } + return 0; +} + +/******************************************************************************* + + + + ******************************************************************************/ + +void log_open(){ + flog = fopen(DBFILE_LOG,"a"); + if(!flog){ + perror("fucked"); + } + U32 here = ftell(flog); + last_stamp = log_stamp_of(here); + // printf("HERE: %d Now: %d\n",here,LS_STAMP); +} + +/******************************************************************************* + + + + ******************************************************************************/ +void log_close(){ + fclose(flog); +} + +/******************************************************************************* + + + + ******************************************************************************/ +/* log_write +Write a URL into the log, returning a file position of the entry. +If necessary, precede the entry with a timestamp. + + + */ +U32 log_write(char* url){ + U32 here = ((U32)ftell(flog)); + here += stamp_maybe_write(here); + +#ifdef ERRLOG + fputs(url,errlog); + fputc('\n',errlog); + +#endif + + fputs(url,flog); + + putc(0,flog); // null-term + return here; +} +/******************************************************************************* + + + + ******************************************************************************/ +// log_read read log entry at pos into buf +U32 log_read(U32 pos,char* buf){ + FILE* f = fopen(DBFILE_LOG,"r"); + fseek(f,pos,SEEK_SET); + U32 ret = fread(buf,1,256,f); + fclose(f); + return ret; +} +/******************************************************************************* + + + + ******************************************************************************/ diff --git a/log.h b/log.h new file mode 100644 index 0000000..e12339d --- /dev/null +++ b/log.h @@ -0,0 +1,5 @@ +U64 log_stamp_of(U32 pos); +void log_open(); +void log_close(); +U32 log_write(char* url); +U32 log_read(U32 pos,char* buf); diff 
--git a/lookup.c b/lookup.c new file mode 100644 index 0000000..7d058ad --- /dev/null +++ b/lookup.c @@ -0,0 +1,33 @@ +#include +#include +#include "global.h" +#include "db.h" + +FILE* errlog; +#undef ERRLOG +int doit(char* req){ + + char buf[1024]; + U32 idx = sigil_idx(req); + int ret = idx_URL(idx,buf); + if(ret) { + + printf("[%s](@%X)%s\n",req,idx*4,buf); + return 0; + } else { + printf("Not Found\n"); + return 1; + } +} +int main(int argc,char*argv[]){ + if(strlen(argv[1])!=4) { + printf("Usage: lookup XXXX, where XXXX is a 4-char sigil\n"); + return 1; + } + + sys_open(); + int ret = doit(argv[1]); + sys_close(); + return ret; +} + diff --git a/main.c b/main.c new file mode 100644 index 0000000..9c4c0a9 --- /dev/null +++ b/main.c @@ -0,0 +1,87 @@ +#include +#include +#include "global.h" +#include "db.h" +#include "url.h" +#include //exit +#define ERRLOG 1 +FILE* errlog; + + + +extern void get(FILE* f,char*host,U32 port,char* path); +extern void render_file(FILE*in,FILE* out); +/* url_path + +Given a mutable full url string, split it, peeling the host. +********/ + + +void fetch_sUrl(FILE* f,sUrl* purl){ + char host[256]; + U32 host_len = sUrl_host_length(purl); + strncpy(host,purl->buf+purl->oHost,host_len); + host[host_len]=0; + char*px = strchr(host,':'); //port? 
+ if(px) *px=0; + + U32 port = sUrl_port(purl); + char* path = sUrl_path(purl); + + // printf("fetching: %s %d %s\n",host,port,path); + get(f,host,port,path); +} +sUrl base; // extern'ed in gemtext.c +int doit(char* req){ + char buf[1024]; + char* p = req; + // If the parameter is exactly 4 long, it is a sigil + if(4==strlen(req)){ + U32 idx = sigil_idx(req); + idx_URL(idx,buf); + printf("idx %4.4X %s\n",idx,buf); + p = buf; + } else { // it must be a full url to start with + if(!url_is_full(req)) + return 1; + URL_idx(req); // check the root URL with the db + } + sUrl_set(&base,p,strlen(p)); + sUrl_full(&base); + // sUrl_dump(&base); + + + FILE* fgmi = fopen(TEMPFILE_GMI,"w+"); + FILE* fout = stdout;//fopen(TEMPFILE_OUT,"w+"); + fetch_sUrl(fgmi,&base); + fseek(fgmi,0,SEEK_SET); + render_file(fgmi,fout); + // fclose(fout); + fclose(fgmi); + + // printf("[%s]\n",URL_dir(argv[1],urlbuf)); + + + return 0; + +} + + +extern int index_create(); + +int main(int argc,char*argv[]){ + printf("ARGC: %d\n",argc); + if(argc != 2){ + printf("spa is an experimental Spartan protocol browser\nUsage:\n spa \n spa \n"); + return -1; + } + errlog = fopen("errlog.txt","a"); + int failed = sys_open(); + if(!failed){ + failed = doit(argv[1]); + } + sys_close(); + fclose(errlog); + return failed; +} + diff --git a/sigil.c b/sigil.c new file mode 100644 index 0000000..117fbd8 --- /dev/null +++ b/sigil.c @@ -0,0 +1,99 @@ +#include +/* +#include +#include +#include +#include +#include + +#include +#include +*/ +#include //toupper + +#include "global.h" +/* SIGILS + +A sigil is shortcut to a URL previously seen by the system and databased. + +In this application, a sigil corresponds to a 20-bit value, encoded as a 4-char +string (using only capital letters and numbers, but not O,I,0,or 1 to avoid +confusion). Sigils are meant to be somewhat easy to recognize. + +Sigils are used as indices into the URL database, resolving as URLS. + +Sigils are generated as FNV1a hashes of URLs provided. 
#define IDX_SIZE (0x100000 * IDX_ENTRY_SIZE)
#define IDX_MASK 0xFFFFF

// ULL: these constants need 64 bits even where unsigned long is 32 bits wide
#define FNV_OFFSET 14695981039346656037ULL
#define FNV_PRIME 1099511628211ULL

// working character map, 32 characters (no O, I, 0 or 1)
static const char charmap[]="ABCDEFGHJKLMNPQRSTUVWXYZ23456789";

/*******************************************************************************

string_hash - generate a FNV1A hash of 20 bits

The 64-bit FNV-1a hash of the bytes of str is computed and its low 20 bits
(IDX_MASK) are returned.

 ******************************************************************************/

U32 string_hash(const char* str) {
  uint64_t hash = FNV_OFFSET;
  for (const char* p = str; *p; p++) {
    hash ^= (uint64_t)(unsigned char)(*p);
    hash *= FNV_PRIME;
  }
  return hash & IDX_MASK;
}

/*******************************************************************************

char_to_c5bit  Convert a 5-bit encoded as a character to a 5-bit value

Lower-case letters are accepted and treated as upper case.

Return: 0..31, or -1 when ch is not a sigil character.

 ******************************************************************************/
int char_to_c5bit(int ch){
  // toupper() is only defined for unsigned char values (and EOF)
  int c = toupper((unsigned char)ch);
  for(int i=31;i>=0;i--)
    if(c==charmap[i])
      return i;
  return -1;
}

/*******************************************************************************

idx_sigil  Convert a 20-bit idx to a 4-char sigil

sigil must have room for 5 bytes (4 characters and the terminator).

 ******************************************************************************/
void idx_sigil(U32 idx,char* sigil){
  for(int i=15;i>=0;i-=5){
    *sigil++ = charmap[0x1F & (idx>>i)];
  }
  *sigil=0;
}
/*******************************************************************************

sigil_idx  Convert a 4-char null-termed sigil to a 20-bit idx

Return: the idx, or (U32)-1 when any of the first four characters is not a
        sigil character (this includes a string shorter than four characters;
        scanning stops at the terminator). Callers must check for (U32)-1
        before using the result as a table index.

 ******************************************************************************/
U32 sigil_idx(char* sigil){
  U32 result = 0;
  for(int i=0;i<4;i++){
    int code = char_to_c5bit(*sigil++);
    if(code<0)                // reject the character itself, not the running total
      return (U32)-1;
    result = (result << 5) | (U32)code;
  }
  return result;
}
+ + ******************************************************************************/ +int url_is_spartan(char* url){ + return !(strncmp("spartan://",url,10)); +} + +int url_is_full(char* url){ + return (NULL != strstr(url,"://")); +} + +char* sUrl_path(sUrl* purl){ + return purl->buf + purl->oPath; +} + +U16 sUrl_port(sUrl* purl){ + return purl->port; +} + +/******************************************************************************* + +print the sUrl + + ******************************************************************************/ +void sUrl_dump(sUrl* purl){ + printf("sUrl at %p\n",purl); + int ohost = purl->oHost; + int opath = purl->oPath; + char* buf = purl->buf; + + if(opath){ + printf("scheme: %*.*s\n",ohost,ohost,buf); + printf("host: %*.*s\n",opath-ohost,opath-ohost,buf+ohost); + printf("port: %d\n",purl->port); + printf("path: %s\n",buf+opath); + } else { + printf("invalid:[%s]\n",buf); + } +} +// fputs(purl->buf,f); + + +/******************************************************************************* + +Set the sUrl to the string fragment provided. The sUrl is reset and needs +further processing! + + ******************************************************************************/ +int sUrl_set(sUrl*p,char*str,U32 len){ + if(!len) + len = strlen(str); + if (len > URL_MAX) return 1; // didn't fit + p->oHost = 0; + p->oPath = 0; + p->oFile = 0; + p->port = 300; + strncpy(p->buf,str,len); + p->buf[len]=0; + return 0; +} +/******************************************************************************* + +Copy the src sUrl into the destingation sUrl, along with all components. + + ******************************************************************************/ +sUrl* sUrl_copy(sUrl* dst, sUrl* src){ + memcpy(dst,src,sizeof(sUrl)); + return dst; +} +/******************************************************************************* + +file Return the pointer to the file component of this URL, and set the internal +offset. 
/*******************************************************************************

host_port - extract the port from the "host[:port][/path]" part of a URL.

p points at the first character of the host. The host is scanned up to the
first ':'; the digits that follow, up to the end of the string or the '/' that
starts the path, are the port.

Return: the port number;
        300 (the default) when the host carries no ':' before the path;
        0   when the port is empty, contains a non-digit, or exceeds 65535
            (it is stored in a 16-bit field).

 ******************************************************************************/
U32 host_port(char* p){
  // find the ':' that introduces the port, staying inside the host part
  for(;;p++){
    char c = *p;
    if(!c) return 300;          // at end? default
    if('/'==c) return 300;      // at /? default
    if(':'==c) break;           // at : parse
  }
  p++;                          // step over the ':'

  U32 val = 0;
  // The port ends where the path begins; running on into the path made every
  // "host:port/path" look like an invalid port.
  for(;*p && ('/'!=*p);p++){
    if((*p<'0') || (*p>'9'))
      return 0;                 //invalid port
    val = (val * 10) + (U32)(*p - '0');
    if(val > 65535)
      return 0;                 //out of range
  }

  return val;
}
base->oPath : base->oFile); + strncpy(dst,str,len); + dst[len]=0; +} +/******************************************************************************* + +norm - given a full but not canonical sUrl, process to eliminate ./ and ../ + +return: 0 success + 1 error + ******************************************************************************/ +// backup from p to the previous '/' +char* _norm_backup(char* p,char* start){ + while(--p >= start) + if('/'==*p) + return p; + return NULL; +} + +int sUrl_norm(sUrl* purl){ + char* start = purl->buf + purl->oPath; + char* src = start; + char* dst = start; + + //printf("normalizing[%s]\n",src); + char c; + while((c = (*dst++ = *src++))){ + // printf("c=%c[%s]\n",c,src-1); + if('/'==c){ + if(!strncmp(src,"../",3)){ + if(!(dst = _norm_backup(dst-1,start))) + return 1; + else + src += 2; //src and dest at respecive '/'s + + } else if(!strncmp(src,"./",2)) { + src += 2; + } else if(!strcmp(src,"..")){ + if(!(dst = _norm_backup(dst-1,start))) + return 2; + dst++; // leave the slash + src += 2; + *dst++=0; + // printf("src:[%s]; dst:[%s];\n",src,dst); + } else if(!strcmp(src,".")){ + *dst++=0; + } + } + } + *start = '/'; + + // printf("postnorm[%s]\n",start); + return 0; +} +/******************************************************************************* + + + ******************************************************************************/ + +U32 sUrl_host_length(sUrl* purl){ + return (purl->oPath - purl->oHost); +} + +/* +char URL_base[1024]; +int baselen; +int bhost_off; +int bpath_off; +*/ +/* +char buf[2048]; +char* scheme; +char* host; +char* path; + + +int url_make_base(){ + strcpy(URL_base,scheme); + baselen = strlen(URL_base); + bhost_off = host - scheme; + bpath_off = path - scheme; + return 0; +} + + + +int url_do_part(char*src){ + scheme = src-baselen; + strncpy(scheme,URL_base,baselen); + host = scheme + bhost_off; + path = scheme + bpath_off; + return 0; +} +*/ +/* url_do_full -- Try to parse a full URL */ +/* +int 
url_do_full(char*src){ + char* p = strstr(src,"://"); //if scheme is there it ends with spartan|://|host|/| + if(!p) + return 1; //no scheme sigil = error! + + scheme = src; //scheme at start + host = p+3; //host starts after :// + path = strchr(host,'/'); //host ends with a /path + if(!path) { // if no path provided, + int i = strlen(host); // append a '/' and make one. + path = host+i; + *path='/'; + *(path+1)=0; + } + return 0; +} + +//int url_is_full(char*src){ + + + +char* path_norm(char* path){ + *(path-1) = 0; + char* src = path; + char* dst = path; + char c; + while((c=(*dst++ = *src++))){ + if('/'==c){ + if(!strncmp(src,"../",3)){ + if(!(dst = path_backup(dst-1))) + return NULL; + src += 2; + } else if(!strncmp(src,"./",2)) { + src += 2; + } else if(!strcmp(src,"..")){ + if(!(dst = path_backup(dst-1))) + return NULL; + dst++; // leave the slash + src += 2; + // printf("src:[%s]; dst:[%s];\n",src,dst); + } else if(!strcmp(src,".")){ + src++; + } + } + } + return path; +} + +*/ + /* +int main(int argc,char**argv) +{ + sUrl base; + sUrl_set(&base,"spartan://192.168.0.2:8888/servlet/rece/holo"); + sUrl_full(&base); + printf("FILE: %s\n",sUrl_file(&base)); + + + sUrl url; + sUrl_copy(&url, &base); + sUrl_rel(&url,argv[1]); + + int q = sUrl_norm(&url); + printf("normalize returned %d\n",q); + sUrl_print(&url); + return 0; + // strcpy(URL_base,"spartan://192.168.0.2:8888/servlet/rece/"); + //baselen = strlen(URL_base); + //bhost_off = 10; + //bpath_off = 26; + */ + /* + memset(buf,0,2048); + strcpy(buf+256,argv[1]); + q = url_do_full(buf+256); + int lScheme = (int)(host-scheme); + printf("Returned %d\n",q); + printf("scheme: %*.*s\n",lScheme,lScheme,scheme); + printf("host: %s\n",host); + printf("path: %s\n",path); + */ + + // char* p = path_norm(path); + //printf("norm: %s\n",p); + +//} + diff --git a/url.h b/url.h new file mode 100644 index 0000000..e9c5a0b --- /dev/null +++ b/url.h @@ -0,0 +1,27 @@ +#define URL_MAX 2044 +typedef struct sUrl { + U8 oHost; + U8 
oPath; + U16 oFile; + U16 port; // _ACTUAL_ port number specified in the URL, + U16 res; + char buf[URL_MAX]; +} sUrl; + + +int url_is_spartan(char* url); +int url_is_full(char* url); +U32 host_port(char* host); + +void sUrl_dump(sUrl* purl); + +int sUrl_set(sUrl*p,char*str,U32 len); +sUrl* sUrl_copy(sUrl* dst, sUrl* src); +char* sUrl_file(sUrl* purl); +int sUrl_full(sUrl* purl); +void sUrl_rel(sUrl*purl,char* str,U32 len); +int sUrl_norm(sUrl* purl); + +char* sUrl_path(sUrl* purl); +U16 sUrl_port(sUrl* purl); +U32 sUrl_host_length(sUrl* purl);