Skip to content

Commit

Permalink
sync
Browse files Browse the repository at this point in the history
  • Loading branch information
mountaineerbr committed Nov 17, 2024
1 parent be4388e commit 5f29583
Show file tree
Hide file tree
Showing 51 changed files with 34,730 additions and 0 deletions.
32 changes: 32 additions & 0 deletions PMWMT/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Poor Man's Webmaster Tools

![ScreenShot](logo_ssc.jpg)

## Silly Software Company

-=oOo=-


A poor man's way of doing things
is still a way to get things done


A poor man's website will often consist of a collection of static html files that get uploaded to a webserver under an el cheapo internet account, and lack any of the tools associated with real web hosting and content management systems.

The Silly Software Company fixes this problem by offering you the
[Poor Man's Webmaster Tools](https://web.archive.org/web/20200215004706/http://users.telenet.be/mydotcom/howto/www/tools.htm),
a collection of VBScript and/or Bash shell scripts to automate common (and less common) webmaster tasks.


-------

Some original scripts and my own modifications.

<http://users.telenet.be/mydotcom/sillysof/index.htm>

<http://users.telenet.be/mydotcom/howto/www/tools.htm>


Wayback machine copy:

<https://web.archive.org/web/20200215004706/http://users.telenet.be/mydotcom/sillysof/index.htm>
27 changes: 27 additions & 0 deletions PMWMT/bulktext.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
## Append the contents of 'srcfile' (eg the webstat counter script) to every
## listed html file that does not contain the text 'webstat' yet, keeping the
## insertion inside the html document body.
#http://users.telenet.be/mydotcom/program/shell/textprocess.htm

## find files that don't have 'webstat' text in it
# grep, recursive, list non-matching --> names containing ".htm" only
# (-F so the dot is a literal dot, not "any character") --> 'targets' file
grep -L -R "webstat" /home/me/website | grep -F ".htm" > targets.lst

## review and edit target list (remove files that don't need changing)
vim targets.lst

## read file list and process files therein
# IFS= and -r keep blanks and backslashes in file names intact
while IFS= read -r filename; do
  # blank lines left over from editing would make sed read the list itself
  [[ -n "$filename" ]] || continue

  # never create a new file by appending to a path that does not exist
  if [[ ! -f "$filename" ]]; then
    printf 'skipping %s: not a regular file\n' "$filename" >&2
    continue
  fi

  # remove /body and /html tags at end of file so insertion doesn't fall
  # outside html document body
  sed -i \
    -e 's/<\/html>//g' \
    -e 's/<\/body>//g' \
    -e 's/<\/HTML>//g' \
    -e 's/<\/BODY>//g' \
    -- "$filename"

  # insert text from a file (eg the webstat counter script)
  cat srcfile >> "$filename"

  # insert body and html end tags again
  echo " </body>" >> "$filename"
  echo "</html>" >> "$filename"
done < targets.lst

67 changes: 67 additions & 0 deletions PMWMT/deadlinks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
# (c) Koen Noens 2009
#
# Silly Software Productions - Poor Man's Webmaster Tools
#
# find dead links in a set of html files
# only checks the off-site links, identified by absolute url href
#http://users.telenet.be/mydotcom/howto/www/deadlinks.html

## PARAMS
#directory for search
LOCALDIR="websites/mydotcom"

#file to list pages that contain dead links
UPLOADFILE="websites/mydotcom/upload_after_fix"
TODOLIST="websites/mydotcom/deadlinkslist.$( date +%F )"

# scratch list of "file;url" records; removed again on every exit path
TMPFILE=$(mktemp) || exit 1
trap 'rm -f -- "$TMPFILE"' EXIT


### Main -- rather verbose so we have progress indication
echo -e "\n\n collecting hyperlinks in $LOCALDIR \n\n"

# find files and hyperlinks
# (directories are skipped so grep does not complain about them; '+' hands
# many files to one grep instead of starting one grep per file)
find "$LOCALDIR" ! -type d -exec grep -l "<a href=\"http://.*>" {} + |
  while IFS= read -r FILE; do
    # gradually reduce the matches until we have a clean url
    grep -o "<a href=\"http://.*>" "$FILE" |
      grep -o "http://[[:graph:]]*\"" |
      while IFS= read -r URL; do
        URL=${URL%'"'} # remove trailing double quote

        # dump filenames and urls in tempfile for further processing
        printf '%s;%s\n' "$FILE" "$URL" >> "$TMPFILE"
      done
  done


echo -e "\n\n starting check for broken links ... \n\n"

sort -u "$TMPFILE" | while IFS= read -r RECORD; do
  # split on the first ';' only, so a ';' inside the url is preserved
  FILE=${RECORD%%;*}
  URL=${RECORD#*;}

  printf '\n %s - checking %s \n\n' "$FILE" "$URL"

  if ! wget --spider "$URL"; then
    # url not found
    printf 'ERROR retrieving %s \n\n\n' "$URL"

    # put file + url on todo-list, and put file on list to upload after fix
    echo "$FILE" >> "$UPLOADFILE"
    echo "$FILE - $URL" >> "$TODOLIST"
  fi
done

# sort and uniq the upload_file
# Only when it exists: with no dead links there is nothing to tidy, and the
# scratch records must never end up in the upload list.
if [[ -e "$UPLOADFILE" ]]; then
  sort -u -o "$UPLOADFILE" "$UPLOADFILE"
fi
Binary file added PMWMT/logo_ssc.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 45 additions & 0 deletions PMWMT/sitemap1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# script to create sitemap.txt
# Koen Noens, October 2006
#http://users.telenet.be/mydotcom/howto/www/sitemap.htm
#
# Lists every page under LOCAL_ROOT whose name ends in one of EXTENSIONS,
# turns the local paths into URLs below SITE_ROOT, applies the optional
# exclude.lst and writes the sorted result to sitemap.txt.

LOCAL_ROOT="/home/jp/websites/mysite" # replace with your path
SITE_ROOT="http://my.isp.com/my_site" # replace with your site URL
EXTENSIONS=".htm .html .php .asp .aspx .jsp"

# Work inside the site root. Abort if that fails, so that the rm and the
# output below can never hit whatever directory the script was started from.
cd "$LOCAL_ROOT" || exit 1

if [[ -e sitemap.txt ]]; then
  rm -- sitemap.txt
else
  echo "no previous sitemap found"
fi

# scratch files: raw find output and the derived url list
FOUNDFILES=$(mktemp) || exit 1
trap 'rm -f -- "$FOUNDFILES" "$URLLIST"' EXIT
URLLIST=$(mktemp) || exit 1

#find all .htm, .html, .php, ... pages
# EXTENSIONS is deliberately unquoted: it is a space-separated list
for ext in $EXTENSIONS; do
  find . -name "*$ext" >> "$FOUNDFILES"
done

# remove leading . and insert site_root to build urls
# (line-wise read, so file names containing blanks stay in one piece)
while IFS= read -r FILE; do
  printf '%s%s\n' "$SITE_ROOT" "${FILE#.}"
done < "$FOUNDFILES" > "$URLLIST"


# if there is an exclude list, exclude the files in it from the sitemap
# Each entry is used as a sed pattern with ',' as delimiter and only the
# matching text is removed, so entries are expected to be complete urls.
if [[ -e exclude.lst ]]; then
  while IFS= read -r entry; do
    # an empty pattern is a sed error, not "match nothing"
    [[ -n "$entry" ]] || continue
    sed -i "s,$entry,,g" "$URLLIST"
  done < exclude.lst
  # remove blank lines as well
  sed -i '/^$/d' "$URLLIST"
fi

# finishing touches
sort -f -u "$URLLIST" >> sitemap.txt

# add sitemap to files_to_upload
echo "$LOCAL_ROOT/sitemap.txt" >> "$LOCAL_ROOT/upload"
69 changes: 69 additions & 0 deletions PMWMT/sitemap2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
# Koen Noens, December 2007
# Site map generator.
#
# Creates an indented list of hyperlinks that represents a directory listing
# of a web site (a "sitemap").
#http://users.telenet.be/mydotcom/howto/www/sitemap02.txt
#http://users.telenet.be/mydotcom/upub/sitemap.htm

## script-wide settings
# local directory that is scanned
TARGET="/home/me/website"
# url prefix that replaces the local directory in the generated links
URLPRE="http://my.hosting.provider.com/mywebsite"
# generated page
SITEMAP="${TARGET}/sitemap.htm"

# extension of the pages to list
EXT=htm
# placeholders, filled in by the main part of the script
OUT=
SKIP=0

# html markup for one indentation step (eight non-breaking spaces)
HTMLTAB='&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;'

## functions
#######################################
# Count the depth of a path in the directory tree.
# Arguments: $1 - path (absolute or relative; may contain blanks)
# Returns:   the depth as exit status: 1 for a path directly below "/" (or a
#            bare relative name), plus 1 for every further parent directory.
#            Being an exit status, the value is limited to 0-255.
#######################################
countIndents() {
  local depth=1
  local path=$1
  local parent

  parent=$(dirname -- "$path")
  # dirname ends at "/" for absolute paths and at "." for relative ones;
  # stopping on both keeps relative or empty input from looping forever
  while [[ "$parent" != "/" && "$parent" != "." ]]; do
    path=$parent
    depth=$((depth + 1))
    parent=$(dirname -- "$path")
  done
  return "$depth"
}


## main
# depth of the site root itself; subtracted from every entry's depth below
countIndents "$TARGET"
SKIP=$?

# scratch file for the find output; removed again on every exit path
OUT=$(mktemp) || exit 1
trap 'rm -f -- "$OUT"' EXIT

# remove the previous sitemap before searching, so it is not listed itself
rm -f -- "$SITEMAP"

### experiment with find and sort to get ordered output
#find $TARGET -type d -o -name "*.$EXT" >> $OUT
find "$TARGET" -name "*.$EXT" -o -type d > "$OUT"
#sort -n -o $OUT $OUT

echo "<html><head><title>sitemap</title></head><body>" >> "$SITEMAP"
while IFS= read -r ENTRY; do
  countIndents "$ENTRY"
  TABS=$(( $? - SKIP ))

  # one indentation step for the root level plus one per level below it
  for (( i = 0; i <= TABS; i++ )); do
    printf '%s' "$HTMLTAB" >> "$SITEMAP"
  done

  ### replace local hierarchy with url-prefix
  # find prints every entry with TARGET as literal prefix; swapping that
  # prefix here avoids using TARGET as a regular expression afterwards
  printf '<a href="%s">%s</a><br>\n' \
    "${URLPRE}${ENTRY#"$TARGET"}" "$(basename -- "$ENTRY")" >> "$SITEMAP"

  # progress
  echo -n "."
done < "$OUT"
echo "</body></html>" >> "$SITEMAP"


# cleanup (the temp file is removed by the EXIT trap)
echo
exit
122 changes: 122 additions & 0 deletions PMWMT/tkn-cnt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python
# tkn-cnt.py - Count tokens of text string
# Usage: tkn-cnt.py [MODEL|ENCODING] [TEXT|FILE|-]..
# v0.1.6 april/2023 by mountaineerbr
import os
import sys
import getopt
try:
import tiktoken
except:
sys.stderr.write("Err: Install tiktoken module: `pip install tiktoken`\n")
sys.exit(1)


# Defaults: text to tokenise, model name, and the encoding used when neither
# a model nor an encoding can be resolved from the model name.
text = ""
mod = "gpt-3.5-turbo"
fallback = "cl100k_base"
#davinci: r50k_base

# Script name: last "/"-separated component of argv[0], used in the usage text.
sn = sys.argv[0].rsplit("/", 1)[-1]
usage = (
    "Usage: %s [-ttv] [MODEL|ENCODING] \"[STRING|FILE|-]..\"\n"
    "Usage: %s [-hl]\n"
    "Set \"-\" to read from stdin.\n"
) % (sn, sn)


def usagef():
    """Write the module-level ``usage`` text to standard error."""
    print(usage, end="", file=sys.stderr)

def list_encf():
    """Print each name returned by tiktoken.list_encoding_names(), one per line."""
    names = tiktoken.list_encoding_names()
    sys.stdout.write("".join("%s\n" % name for name in names))


#parse opts
#  -h  print usage and exit        -l  list encoding names and exit
#  -t  counted in optt (repeatable) -v  counted in optv
try:
    opts, args = getopt.getopt(sys.argv[1:], "hltv")
except getopt.GetoptError:
    # diagnostics go to stderr, like the other error messages of this script
    sys.stderr.write("Error: Unknown option.\n")
    sys.exit(2)

# check / check_two are set further down, when a positional argument was
# tentatively taken as the model name
optt, optv, check, check_two = 0, 0, 0, 0
for opt, _ in opts:
    if opt == '-h':
        usagef()
        sys.exit()
    elif opt == '-l':
        list_encf()
        sys.exit()
    elif opt == '-t':
        optt += 1
    elif opt == '-v':
        optv += 1


#input, pos args or stdin
# Picks the text to tokenise and, where given, the model name from the
# positional arguments.
# NOTE(review): the nesting of the multi-argument branch (the `else` and the
# following `if not os.path.isfile(args[0])`) is inferred from the usage
# "[MODEL|ENCODING] [TEXT|FILE|-].." -- confirm against the intended layout.
if len(args) > 1 and args[1] == "-":
    # MODEL -
    text = sys.stdin.read()
    mod = args[0]
elif len(args) > 1 and args[0] == "-":
    # - MODEL
    text = sys.stdin.read()
    mod = args[1]
elif len(args) > 1:
    if os.path.isfile(args[0]) or os.path.isfile(args[1]):
        # concatenate every argument that names an existing file
        for path in args:
            if os.path.isfile(path):
                # 'with' closes the handle again once the file is read
                with open(path, 'r') as fh:
                    text += fh.read()
                if not optv:
                    sys.stderr.write("File: %s\n" % path)
    else:
        text = " ".join(args[1:])
    if not os.path.isfile(args[0]):
        # first argument is tentatively the model name; 'check' records that
        # it may have to be treated as text after all
        mod = args[0]
        check = 1
elif len(args):
    if args[0] == "-":
        text = sys.stdin.read()
    elif os.path.isfile(args[0]):
        with open(args[0], 'r') as fh:
            text = fh.read()
        if not optv:
            sys.stderr.write("File: %s\n" % (args[0]))
    else:
        # single argument: could be a model name or the text itself
        mod = args[0]
        text = args[0]
        check_two = 1
else:
    usagef()
    sys.exit(2)

#model / encoding
# Try the name as a model first, then as an encoding name, and finally use
# the fallback encoding. Only the first 50 characters of the name are used.
# `except Exception` keeps every lookup failure falling through as before,
# while KeyboardInterrupt / SystemExit are no longer swallowed.
# NOTE(review): placement of the `if check_two` / `if check` statements inside
# the try / inner except is inferred -- confirm.
try:
    enc = tiktoken.encoding_for_model(mod[0:50])
    #sys.stderr.write("Model: %s %s\n" % (mod , str(enc)) )
    if check_two:
        # the single argument was a model name, so there is no text
        text = ""
except Exception:
    try:
        enc = tiktoken.get_encoding(mod[0:50])
        #sys.stderr.write("Encoding: %s\n" % mod )
        mod = ""
    except Exception:
        enc = tiktoken.get_encoding(fallback)
        #sys.stderr.write("Warning: Model or encoding not found. Using %s.\n" % fallback)
        if check:
            # first argument was neither model nor encoding: count it as text
            text = args[0] + " " + text

#
enc_name = str(enc)
encoded_text = enc.encode_ordinary(text)
#encoded_text = enc.encode(text, disallowed_special=())

# Select what to print:
#   -t twice or more -> the input text     -t once -> the encoded token list
#   -v               -> the token count    default -> token count and encoding
if optt > 1:
    fields = (text,)
elif optt:
    fields = (encoded_text,)
elif optv:
    fields = (len(encoded_text),)
else:
    fields = (len(encoded_text), enc_name)
print(*fields)

#https://github.com/openai/tiktoken/blob/main/tiktoken/core.py
#https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
#https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
Loading

0 comments on commit 5f29583

Please sign in to comment.