Mythsman


乐极生悲,苦尽甘来。


解决Hdoj3337问题的简易爬虫

这是好久前遇到的一道非主流题,当时愣是没弄明白题意。最近闲着没事翻开来看了看,并在网上找到了某大牛写的爬虫,写得真美,顿时觉得有必要收藏一下。虽然现在不能完全看懂,但是我想不久的将来,当我想系统地学Python的时候,这肯定是很有用的东西。
Hdoj3337

题目非常短,主要是这句话:

There is only one line in the input. It is a sentence which implies some integer. The length of the sentence is no more than 16. The sentence contains only letters(both uppercase and lowercase), digits, and whitespace. You may assume the given sentence is written in standard English, and always implies exactly one single integer.

意思就是给你一句话,让你写出这句话“imply”的一个数字……然而你并不知道imply是什么意思,所以你必须知道他给的这句话是什么,因此解决这道题的标准做法就是通过OJ提供的 AC,WA,TLE,MLE 等提示来对样例进行数据挖掘,具体做法就是类似二分的东西,通过多次提交来确定每一位的字母是什么。

以下是大牛写的程序:

import requests
import time
import sys
# Pages of the HDU online judge used by this script.
homepage_url = 'http://acm.hdu.edu.cn/'
login_url = 'http://acm.hdu.edu.cn/userloginex.php'
submit_url = 'http://acm.hdu.edu.cn/submit.php'
status_url = 'http://acm.hdu.edu.cn/status.php'

# Numeric codes for the judge's language choices.
language = {
    'G++': 1,
    'GCC': 2,
    'C++': 3,
    'C': 4,
    'Pascal': 5,
    'Java': 6,
    'C#': 7,
}

# Byte values recovered so far (filled in by find_input_data).
file_data = []
# Run id of the last submission whose verdict was consumed by test_submit.
previous_runid = ''

# Judge verdict texts that encode the three possible answers of a probe.
# They match what the C++ templates provoke: an endless loop, endless output,
# or a wrong one-line answer.
greater = 'Time Limit Exceeded'
less = 'Output Limit Exceeded'
equal = 'Wrong Answer'

# Placeholder credentials; the command-line handling at the end of the script
# overwrites them.
username = 'your username'
userpass = 'your password'
# C++ probe that reveals one byte of the judge's hidden input.
# The caller prepends definitions of the globals `skip` (byte offset to
# inspect) and `guess` (candidate value). The program compares the byte at
# offset `skip` with `guess` and provokes a distinct verdict:
#   byte == guess -> prints a wrong answer and exits (Wrong Answer)
#   byte <  guess -> writes output forever          (Output Limit Exceeded)
#   byte >  guess -> spins forever                  (Time Limit Exceeded)
# This is a raw string so that the `\n` inside printf() reaches the C++
# compiler as an escape sequence. In a normal Python string it would become a
# real line break inside the C++ string literal and the submission would not
# compile.
template_input_data = r'''
#include <cstdio>
#include <cstdlib>
char buffer[0x10000];//64KB
// Wrong Answer
void equal(void)
{
	printf("www.hoxily.com\n");
	exit(0);
}
// OLE
void less(void)
{
	while (true)
	{
		fwrite(buffer, sizeof(char), sizeof(buffer), stdout);
	}
}
// TLE
void greater(void)
{
	while (true)
	{
	}
}

int main(void)
{
	int ch;
	int count = 0;
	while ((ch = getchar()) != EOF)
	{
		if (ch < 0)
		{
			ch = ch + 256;
		}
		if (count == skip)
		{
			if (ch == guess)
			{
				equal();
			}
			else if (ch < guess)
			{
				less();
			}
			else
			{
				greater();
			}
		}
		count++;
	}
	return 0;
}
'''
# C++ probe that reveals the length of the judge's hidden input.
# The caller prepends a definition of the global `guess`. The program counts
# the bytes on stdin and provokes a distinct verdict:
#   count == guess -> prints a wrong answer and exits (Wrong Answer)
#   count <  guess -> writes output forever          (Output Limit Exceeded)
#   count >  guess -> spins forever                  (Time Limit Exceeded)
# This is a raw string so that the `\n` inside printf() reaches the C++
# compiler as an escape sequence. In a normal Python string it would become a
# real line break inside the C++ string literal and the submission would not
# compile.
template_input_length = r'''
#include <cstdio>
#include <cstdlib>
#define MAXLEN 0x7fffffff
#define MINLEN 0
char buffer[0x10000];//64KB
// Wrong Answer
void equal(void)
{
	printf("www.hoxily.com\n");
	exit(0);
}
// OLE
void less(void)
{
	while (true)
	{
		fwrite(buffer, sizeof(char), sizeof(buffer), stdout);
	}
}
// TLE
void greater(void)
{
	while (true)
	{
	}
}

int main(void)
{
	int ch;
	int count = 0;
	while ((ch = getchar()) != EOF)
	{
		count++;
	}
	if (count < guess)
	{
		less();
	}
	else if (count == guess)
	{
		equal();
	}
	else
	{
		greater();
	}
	return 0;
}
'''
# Problem id to probe; replaced by the first command-line argument when the
# script is run.
pid = 1000
# Cookies sent with every request. login() and test_submit() add the
# PHPSESSID session cookie to this dict when the server issues one.
cookies = {
    'exesubmitlang': '0',
    'CNZZDATA1254072405': '965477099-1421749361-|1427515514',
}
def login(referer):
    """Sign in to the judge and remember the session cookie.

    Posts the module-level ``username`` and ``userpass`` to ``login_url``.
    If the response sets a ``PHPSESSID`` cookie, it is stored in the
    module-level ``cookies`` dict (mutated in place) for later requests.

    referer: value sent in the ``referer`` request header.
    """
    request_headers = {
        'referer': referer,
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0',
    }
    form = {'username': username, 'userpass': userpass, 'login': 'Sign+In'}
    response = requests.post(
        login_url,
        params={'action': 'login'},
        data=form,
        headers=request_headers,
        cookies=cookies
    )
    session_id = response.cookies.get('PHPSESSID')
    if session_id is not None:
        cookies['PHPSESSID'] = session_id
def test_length_submit(guess):
    """Submit the length probe for one candidate length.

    Prepends the C++ definition ``int guess = <guess>;`` to
    ``template_input_length`` and submits the result.

    guess: candidate length of the hidden input.
    Returns whatever ``test_submit`` returns: 'lt', 'gt' or 'eq'.
    """
    # The definition must end with a real line break so that the template's
    # first line starts cleanly. A literal "rn" here would be an unknown
    # token in the C++ source and the submission would not compile.
    usercode = 'int guess = ' + str(guess) + ';\r\n' + template_input_length
    return test_submit(usercode)
def find_input_length():
    """Find the byte length of the judge's hidden input.

    Phase 1 doubles ``guess`` (1, 2, 4, ...) until the judge reports that the
    input is shorter than the guess ('lt'). Phase 2 binary-searches between
    the last guess that was not too large and the first guess that was.
    Each probe is one real submission and is printed as it completes.

    Returns the length found. If the binary search runs out of candidates
    without ever seeing 'eq', the last guess tried is returned.
    """
    print('GUESS', 'RESULT')
    guess = 1
    left = 0
    while True:
        result = test_length_submit(guess)
        print(guess, result)
        if result == 'eq':
            # The doubling probe hit the exact length. Stop here rather than
            # spending more submissions to rediscover the same value.
            print('length =', guess)
            return guess
        if result == 'lt':
            break
        # 'gt': the input is longer than guess, so guess is a lower bound.
        left = guess
        guess = guess * 2
    print('LEFT', 'RIGHT', 'GUESS', 'RESULT')
    right = guess
    while left <= right:
        guess = (left + right) // 2
        result = test_length_submit(guess)
        print(left, right, guess, result)
        if result == 'eq':
            print('length =', guess)
            break
        elif result == 'gt':
            left = guess + 1
        elif result == 'lt':
            right = guess - 1
    return guess

def test_submit(usercode):
    """Submit ``usercode`` to the judge and wait for its verdict.

    Submits the source for problem ``pid``, then polls the user's status page
    until the newest row shows a final verdict.

    usercode: complete source text to submit.
    Returns 'lt', 'gt' or 'eq', depending on whether the verdict text contains
    the module-level ``less``, ``greater`` or ``equal`` string.
    Raises Exception for any other final verdict text. A ValueError from the
    later ``text.index`` calls is not caught, so it also propagates.
    """
    global cookies
    headers = {'referer':'http://acm.hdu.edu.cn/showproblem.php?pid=' + str(pid), 'user-agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0'}
    # Fetch the submit page without following redirects; a 302 is taken to
    # mean that the session is not logged in.
    r = requests.get(submit_url, params={'pid':pid}, allow_redirects=False, cookies=cookies, headers=headers)
    if r.cookies.get('PHPSESSID') is not None:
        cookies['PHPSESSID'] = r.cookies.get('PHPSESSID')
    #print(repr(cookies))
    if r.status_code == 302:
        # need login
        login(r.url)
    # The form's language value is the table entry minus one
    # (presumably the form numbers languages from zero -- TODO confirm).
    r = requests.post(submit_url, params={'action':'submit'}, data={'check':0, 'problemid':pid, 'language':language['C++']-1, 'usercode':usercode}, allow_redirects=False, cookies = cookies, headers=headers)
    if r.cookies.get('PHPSESSID') is not None:
        cookies['PHPSESSID'] = r.cookies.get('PHPSESSID')
    #print(repr(cookies))
    # NOTE: the response status of the submit POST is deliberately not
    # checked; the disabled check is kept below for reference.
    # if r.status_code != 302:
        # raise Exception(r)
    # else:
    global previous_runid
    while True:
        # Poll the status page, filtered to this user, once per second.
        time.sleep(1)
        r = requests.get(status_url, params={'user':username, 'lang':0, 'status':0}, headers=headers,cookies=cookies)
        text = r.text
        try:
            # First occurrence of this marker is taken as the newest row.
            index = text.index('tr align=center')
        except ValueError:
            # server is busy so that return something else.
            time.sleep(5)
            continue
        # The run id is the text between the next 'px>' and '</td>'.
        index = text.index('px>', index)
        index += len('px>')
        endIndex = text.index('</td>', index)
        runid = text[index:endIndex]
        # Newest row is still the previously judged run: treat the submission
        # as lost and submit again (recursively).
        if runid == previous_runid: # submit failed
            return test_submit(usercode)
        # The verdict is the text between the next 'green>' and '</font>'.
        index = text.index('green>', index)
        index = index + len('green>')
        endIndex = text.index('</font>', index)
        status = text[index: endIndex]
        # Intermediate states: keep polling.
        if 'Queuing' in status:
            continue
        elif 'Compiling' in status:
            continue
        elif 'Running' in status:
            continue
        # Final verdicts: remember the run id so the next call can tell a new
        # row from this one, then translate the verdict into a comparison.
        elif less in status:
            previous_runid = runid
            return 'lt'
        elif greater in status:
            previous_runid = runid
            return 'gt'
        elif equal in status:
            previous_runid = runid
            return 'eq'
        else:
            raise Exception('unknow judge status: ' + status)
def test_data_submit(skip, guess):
    """Submit the byte probe for one offset and one candidate value.

    Prepends the C++ definitions ``int guess = <guess>;`` and
    ``int skip = <skip>;`` to ``template_input_data`` and submits the result.

    skip: zero-based offset of the byte to inspect in the hidden input.
    guess: candidate value for that byte.
    Returns whatever ``test_submit`` returns: 'lt', 'gt' or 'eq'.
    """
    # Each definition must end with a real line break. A literal "rn" here
    # would be an unknown token in the C++ source and the submission would
    # not compile.
    usercode = 'int guess = ' + str(guess) + ';\r\n'
    usercode = usercode + 'int skip = ' + str(skip) + ';\r\n'
    usercode = usercode + template_input_data
    return test_submit(usercode)
def find_input_data():
    """Recover the judge's hidden input byte by byte.

    First determines the input length, then binary-searches the value
    (0..255) of every byte offset with one submission per probe. Each probe
    is printed as it completes. Recovered values are appended to the
    module-level ``file_data`` list; an offset whose search runs out of
    candidates is recorded as the string 'UNKNOWN'.

    Returns the module-level ``file_data`` list.
    """
    total = find_input_length()
    print('LEFT', 'RIGHT', 'SKIP', 'GUESS', 'RESULT')
    for offset in range(total):
        low, high = 0, 255
        while low <= high:
            candidate = (low + high) // 2
            verdict = test_data_submit(offset, candidate)
            print(low, high, offset, candidate, verdict)
            if verdict == 'eq':
                file_data.append(candidate)
                print(repr(file_data))
                break
            if verdict == 'lt':
                # The byte is smaller than the candidate.
                high = candidate - 1
            elif verdict == 'gt':
                # The byte is larger than the candidate.
                low = candidate + 1
        else:
            # The interval emptied without an exact match.
            file_data.append('UNKNOWN')
    return file_data
# Command-line entry: <pid> <username> <userpass>.
# All three arguments are required. Checking only for "no arguments at all"
# would let a partial command line through and crash with IndexError below.
if len(sys.argv) < 4:
    print('usage:ndiginput.py <pid> <username> <userpass>')
    sys.exit(1)
else:
    # Override the module-level defaults, then run the whole recovery.
    pid = int(sys.argv[1])
    username = sys.argv[2]
    userpass = sys.argv[3]
    find_input_data()

代码的思路也很清楚,就是对每一个字母用二分查找爬出它的值。

执行一下:
myths@myths-X450LD:~/Desktop$ python hdu.py 3337 <用户名> <密码>
等个十几分钟,大概系统提交了100多发,得到了输出结果:

('GUESS', 'RESULT')
(1, 'gt')
(2, 'gt')
(4, 'gt')
(8, 'gt')
(16, 'lt')
('LEFT', 'RIGHT', 'GUESS', 'RESULT')
(8, 16, 12, 'gt')
(13, 16, 14, 'eq')
('length =', 14)
('LEFT', 'RIGHT', 'SKIP', 'GUESS', 'RESULT')
(0, 255, 0, 127, 'lt')
(0, 126, 0, 63, 'gt')
(64, 126, 0, 95, 'lt')
(64, 94, 0, 79, 'gt')
(80, 94, 0, 87, 'lt')
(80, 86, 0, 83, 'gt')
(84, 86, 0, 85, 'lt')
(84, 84, 0, 84, 'eq')
[84]
(0, 255, 1, 127, 'lt')
(0, 126, 1, 63, 'gt')
(64, 126, 1, 95, 'gt')
(96, 126, 1, 111, 'lt')
(96, 110, 1, 103, 'gt')
(104, 110, 1, 107, 'lt')
(104, 106, 1, 105, 'lt')
(104, 104, 1, 104, 'eq')
[84, 104]
(0, 255, 2, 127, 'lt')
(0, 126, 2, 63, 'gt')
(64, 126, 2, 95, 'gt')
(96, 126, 2, 111, 'lt')
(96, 110, 2, 103, 'lt')
(96, 102, 2, 99, 'gt')
(100, 102, 2, 101, 'eq')
[84, 104, 101]
(0, 255, 3, 127, 'lt')
(0, 126, 3, 63, 'lt')
(0, 62, 3, 31, 'gt')
(32, 62, 3, 47, 'lt')
(32, 46, 3, 39, 'lt')
(32, 38, 3, 35, 'lt')
(32, 34, 3, 33, 'lt')
(32, 32, 3, 32, 'eq')
[84, 104, 101, 32]
(0, 255, 4, 127, 'lt')
(0, 126, 4, 63, 'gt')
(64, 126, 4, 95, 'lt')
(64, 94, 4, 79, 'lt')
(64, 78, 4, 71, 'lt')
(64, 70, 4, 67, 'lt')
(64, 66, 4, 65, 'eq')
[84, 104, 101, 32, 65]
(0, 255, 5, 127, 'lt')
(0, 126, 5, 63, 'gt')
(64, 126, 5, 95, 'lt')
(64, 94, 5, 79, 'gt')
(80, 94, 5, 87, 'lt')
(80, 86, 5, 83, 'eq')
[84, 104, 101, 32, 65, 83]
(0, 255, 6, 127, 'lt')
(0, 126, 6, 63, 'gt')
(64, 126, 6, 95, 'lt')
(64, 94, 6, 79, 'lt')
(64, 78, 6, 71, 'lt')
(64, 70, 6, 67, 'eq')
[84, 104, 101, 32, 65, 83, 67]
(0, 255, 7, 127, 'lt')
(0, 126, 7, 63, 'gt')
(64, 126, 7, 95, 'lt')
(64, 94, 7, 79, 'lt')
(64, 78, 7, 71, 'gt')
(72, 78, 7, 75, 'lt')
(72, 74, 7, 73, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73]
(0, 255, 8, 127, 'lt')
(0, 126, 8, 63, 'gt')
(64, 126, 8, 95, 'lt')
(64, 94, 8, 79, 'lt')
(64, 78, 8, 71, 'gt')
(72, 78, 8, 75, 'lt')
(72, 74, 8, 73, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73]
(0, 255, 9, 127, 'lt')
(0, 126, 9, 63, 'lt')
(0, 62, 9, 31, 'gt')
(32, 62, 9, 47, 'lt')
(32, 46, 9, 39, 'lt')
(32, 38, 9, 35, 'lt')
(32, 34, 9, 33, 'lt')
(32, 32, 9, 32, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32]
(0, 255, 10, 127, 'lt')
(0, 126, 10, 63, 'gt')
(64, 126, 10, 95, 'gt')
(96, 126, 10, 111, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111]
(0, 255, 11, 127, 'lt')
(0, 126, 11, 63, 'gt')
(64, 126, 11, 95, 'gt')
(96, 126, 11, 111, 'lt')
(96, 110, 11, 103, 'lt')
(96, 102, 11, 99, 'gt')
(100, 102, 11, 101, 'gt')
(102, 102, 11, 102, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111, 102]
(0, 255, 12, 127, 'lt')
(0, 126, 12, 63, 'lt')
(0, 62, 12, 31, 'gt')
(32, 62, 12, 47, 'lt')
(32, 46, 12, 39, 'lt')
(32, 38, 12, 35, 'lt')
(32, 34, 12, 33, 'lt')
(32, 32, 12, 32, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111, 102, 32]
(0, 255, 13, 127, 'lt')
(0, 126, 13, 63, 'gt')
(64, 126, 13, 95, 'lt')
(64, 94, 13, 79, 'lt')
(64, 78, 13, 71, 'gt')
(72, 78, 13, 75, 'lt')
(72, 74, 13, 73, 'lt')
(72, 72, 13, 72, 'eq')
[84, 104, 101, 32, 65, 83, 67, 73, 73, 32, 111, 102, 32, 72]

最后的一串数字转换为ASCII码就是"The ASCII of H"了。