diff --git a/.idea/AIS-CJS.iml b/.idea/AIS-CJS.iml index d0876a7..8388dbc 100644 --- a/.idea/AIS-CJS.iml +++ b/.idea/AIS-CJS.iml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index dc9ea49..d56657a 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/AIS-CJS/AIS_CJS.py b/AIS-CJS/AIS_CJS.py index 36fea5c..139597f 100644 --- a/AIS-CJS/AIS_CJS.py +++ b/AIS-CJS/AIS_CJS.py @@ -1,5 +1,2 @@ -print("asd") -print("gg") -#��ȫ�� �ù߶��Ƥ����� -print("ee") -print("실행도잘됨") \ No newline at end of file + + diff --git a/AIS-CJS/csvjson.json b/AIS-CJS/csvjson.json new file mode 100644 index 0000000..98281dd --- /dev/null +++ b/AIS-CJS/csvjson.json @@ -0,0 +1,28 @@ + + { + "프로그래밍 언어": 100, + "자바": 62, + "자바 프로그래밍": 62, + "파이썬 프로그래밍": 41, + "파이썬": 40, + "프로그래밍 갤러리": 39, + "코딩": 34, + "프로그램": 34, + "게임 프로그래밍": 33, + "c++": 31, + "객체": 30, + "컴퓨터 프로그래밍": 28, + "c 언어 프로그래밍": 28, + "c 언어": 27, + "객체 지향": 25, + "객체 지향 프로그래밍": 25, + "함수형 프로그래밍": 21, + "안드로이드 프로그래밍": 21, + "소켓 프로그래밍": 20, + "시스템 프로그래밍": 20, + "java": 18, + "명품 자바 프로그래밍": 16, + "프로그래머": 15, + "알고리즘": 14, + "개발자": 13 + } diff --git a/AIS-CJS/dataTrans.py b/AIS-CJS/dataTrans.py new file mode 100644 index 0000000..3c35170 --- /dev/null +++ b/AIS-CJS/dataTrans.py @@ -0,0 +1,21 @@ +import socket + +#연결할 Host, Port 정보 +HOST = '175.200.108.201' +PORT = 5000 + +#소켓 생성 +client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + +#서버에 접속 +client_socket.connect((HOST, PORT)) + +#서버에 "Hello world"메세지 전송 +client_socket.sendall("Hello world".encode()); +print("데이터 전송") +#서버에게서 메시지를 수신(에코) +data = client_socket.recv(1024) +print('Received', repr(data.decode())) + +#클라이언트 소켓을 닫는다. +client_socket.close() \ No newline at end of file diff --git a/AIS-CJS/jobAPI.py b/AIS-CJS/jobAPI.py new file mode 100644 index 0000000..4c0f78d --- /dev/null +++ b/AIS-CJS/jobAPI.py @@ -0,0 +1,51 @@ +from urllib.request import urlopen +from urllib.parse import urlencode, unquote, quote_plus +import urllib +import requests +import json +from xml.etree.ElementTree import parse +import xmltodict +from json.decoder import JSONDecoder +from os import error +import glob + +data = [] + +page = 1 +url = "http://openapi.work.go.kr/opi/opi/opia/wantedApi.do" +key = "WNLB0BQ31I58AQ2YDZ4ET2VR1HK" +occupation = "&occupation=023|024|025|026" + +for page in range(1,11): + queryParams = '?' + urlencode({quote_plus('authKey') : 'WNLB0BQ31I58AQ2YDZ4ET2VR1HK', quote_plus('callTp'): 'L', quote_plus('returnType'): 'XML',quote_plus('startPage'): page, quote_plus('display'): '200', + quote_plus('occupation'): '023|024|025|026'}) + + + + request = urllib.request.Request(url + unquote(queryParams)) + + response_body = urlopen(request, timeout=60).read() # get bytes data + + + decode_data = response_body.decode('utf-8') + print(type(decode_data)) + + xml_parse = xmltodict.parse(decode_data)# string인 xml 파싱 + xml_dict = json.loads(json.dumps(xml_parse)) + + print(xml_dict) + with open('job' +str(page) + '.json', 'w') as f: + json.dump(xml_dict, f) + +for f in glob.glob("job*.json"): + with open(f, encoding="utf-8") as infile: + data.append(json.load(infile)) + +with open("job.json",'w', encoding="utf-8") as outfile: + json.dump(data, outfile, ensure_ascii=False, indent="\t") + +#http post 통신 코드 +# headers = {} +# headers = {'content-type': 'application/json'} +# postData = xml_dict +# response = requests.post("https://test.com", headers=headers, data=postData) diff --git a/AIS-CJS/jobSearch.py b/AIS-CJS/jobSearch.py index e5fe8bd..7eb4fab 100644 --- a/AIS-CJS/jobSearch.py +++ b/AIS-CJS/jobSearch.py @@ -1,2 +1,110 @@ -print("부끄럽다") -print("시발") \ No newline at end of file + +def main(): + import requests + from bs4 import BeautifulSoup + import time + import datetime + + # 잡코리아 + + f = open('jobkorea_apply_urls.csv', 'w') + f.write("기업 이름,모집 제목,경력,학력,우대,고용 형태, 급여, 지역, 모집 기간, 이미지 주소" + '\n') + + keyword = "모바일앱" # 키워드 입력 + + # 페이지순서 + for n in range(1, 2): + raw = requests.get( + "https://www.jobkorea.co.kr/Search/?stext={}&tabType=recruit&Page_No=".format(keyword) + str(n) + , headers={'User-Agent': 'Mozilla/5.0'}) + html = BeautifulSoup(raw.text, "html.parser") + results = html.select("li.list-post") + + for ar in results[0:20]: + company_name = ar.select_one("a.name").text.strip() + detail = ar.select_one("a.title").text.strip() + url = 'https://www.jobkorea.co.kr' + ar.find("a")['href'] + exp = ar.select_one("span.exp").text.strip() + location = ar.select_one("span.loc").text.strip() + apply = ar.select_one("div.post-list-apply").text.strip() + company_name = company_name.replace(",", "") + detail = detail.replace(",", "") + location = location.replace(" 외", "") + now = datetime.datetime.now() + #nowDate = now.strftime('%Y-%m-%d') + raw2 = requests.get(url + , headers={'User-Agent': 'Mozilla/5.0'}) + html2 = BeautifulSoup(raw2.text, "html.parser") + #score = str(html2.select("#tab04 > article.artReadStrategy > div > div > div.devStartlist.listArea.specList > div > div.specListWrap > div > ul > li:nth-child(1) > div > span > em"))[5:8] + date_tag1 = html2.select("#tab02 > div.divReadBx.clear.devMakeSameHeight > article.artReadPeriod > div > dl.date > dd:nth-child(2)") + date_tag2 = html2.select("#tab02 > div.divReadBx.clear.devMakeSameHeight > article.artReadPeriod > div > dl.date > dd:nth-child(4)") + prefer_tag = html2.select("#dlPref > dd > span") + region_tag = html2.select("#container > section > div.readSumWrap.clear > article > div.tbRow.clear > div:nth-child(2) > dl > dd:nth-child(6) > a") + pay_tag = html2.select("#container > section > div.readSumWrap.clear > article > div.tbRow.clear > div:nth-child(2) > dl > dd:nth-child(4)") + edu_tag = html2.select("#container > section > div.readSumWrap.clear > article > div.tbRow.clear > div:nth-child(1) > dl > dd:nth-child(4) > strong") + pattern_tag = html2.select("#container > section > div.readSumWrap.clear > article > div.tbRow.clear > div:nth-child(2) > dl > dd:nth-child(2) > ul > li > strong") + image_tag = html2.select("#cologo") + + image = "" + for i in image_tag: + image = i['src'] + if image == "": + image = "이미지 없음" + else: + image = "http:" + image + date = "" + pay = "" + prefer = "" + region = "" + edu = "" + pattern = "" + for tag in date_tag1: + date += "시작일 " + tag.getText() +" " + for tag in date_tag2: + date += "마감일 " + tag.getText() + for tag in prefer_tag: + prefer += tag.getText() + for tag in region_tag: + region += tag.getText() + for tag in pay_tag: + pay += tag.getText() + for tag in edu_tag: + edu += tag.getText() + for tag in pattern_tag: + pattern += tag.getText() + + pay = remove_blank(pay) + pay = pay.replace(",","") + region = remove_blank(region) + prefer = remove_blank(prefer) + pattern = remove_blank(pattern) + edu = remove_blank(edu) + date = remove_blank(date) + if date == "": + date = "상시 채용" + f.write( + company_name + ',' + detail + ',' + exp + ',' + edu + ',' + prefer + ',' + pattern + ',' + pay + ',' + region + ',' + date + ',' + image + '\n') + time.sleep(1) # 1초 + + + print(str(n) + "번째 페이지 내 " + str(keyword) + " 의 채용공고 크롤링을 완료했습니다.") + print("최종 엑셀 작업 마무리중 입니다.") + + f.close() + + + + print("잡코리아 크롤링이 완료되었습니다!!") + + +def remove_blank(string): + string = string.strip() + string = string.replace("\n", "") + string = string.replace("\r", "") + return string + + + +if __name__ == "__main__": + main() + diff --git a/AIS-CJS/main.ui b/AIS-CJS/main.ui new file mode 100644 index 0000000..0814370 --- /dev/null +++ b/AIS-CJS/main.ui @@ -0,0 +1,40 @@ + + + MainWindow + + + + 0 + 0 + 900 + 600 + + + + + 900 + 600 + + + + false + + + MainWindow + + + + + + 0 + 0 + 900 + 21 + + + + + + + + diff --git a/AIS-CJS/text.csv b/AIS-CJS/text.csv new file mode 100644 index 0000000..4cbdff2 --- /dev/null +++ b/AIS-CJS/text.csv @@ -0,0 +1,26 @@ +Ű, +α׷ ,100 +ڹ,62 +ڹ α׷,62 +̽ α׷,41 +̽,40 +α׷ ,39 +ڵ,34 +α׷,34 + α׷,33 +c++,31 +ü,30 +ǻ α׷,28 +c α׷,28 +c ,27 +ü ,25 +ü α׷,25 +Լ α׷,21 +ȵ̵ α׷,21 + α׷,20 +ý α׷,20 +java,18 +ǰ ڹ α׷,16 +α׷,15 +˰,14 +,13 diff --git a/AIS-CJS/wordCloud.py b/AIS-CJS/wordCloud.py new file mode 100644 index 0000000..0938c87 --- /dev/null +++ b/AIS-CJS/wordCloud.py @@ -0,0 +1,20 @@ + +import json +import matplotlib.pyplot as plt + +from wordcloud import WordCloud + +inputFileName = 'csvjson' +data = json.loads(open(inputFileName+'.json', 'r', encoding= 'utf-8').read()) +font_path = "C:/Users/dofury/AppData/Local/Microsoft/Windows/Fonts/MaruBuri-Bold.ttf" +wc = WordCloud(font_path=font_path, background_color='ivory', width=800, height=600) + + + + +cloud = wc.fit_words(data) +plt.Figure(figsize=(15, 20)) +plt.imshow(cloud) +plt.axis('off') +plt.savefig("wordcloud.png") +plt.show() \ No newline at end of file