[python] WeChati avaliku konto artiklite roomamine

Wechat Public Account Crawling



Nõuded

Indekseerige mõned WeChati avaliku konto artiklid

Andmeallikad

1. Sogou WeChati otsing: sellega saab otsida WeChati ametlikke kontosid, kuid kuvada saab ainult kümmet ametliku konto artiklit
2. Isikliku WeChati ametliku konto materjalihalduse kaudu saab vaadata ka teiste WeChati ametlike kontode artikleid



pilt
pilt



pilt



Sammud

1. Hankige veebisaidilt käsitsi küpsiseid ja logige küpsiste kaudu sisse
2. Hankige päringu URL-ist token
3. Koostage parameetritega päring aadressile https://mp.weixin.qq.com/cgi-bin/searchbiz, et saada ametliku konto fakeid (ehk biz)
4. Koostage parameetritega päring aadressile https://mp.weixin.qq.com/cgi-bin/appmsg, et saada teave artiklite loendi kohta
5. Indekseerige artikleid läbi URL-i

Sel moel pole võimalik saada lugemiste ja meeldimiste arvu, sest ametliku konto artikli avamisel veebilehena lugemiste ja meeldimiste arvu ei kuvata.

Kood

GitHubi hoidla aadress



import json
import re
import time

import requests


class WeChatCrawler():
    """Print the link and title of the articles of WeChat public accounts.

    Construction reads the login cookies from ``cookie.txt`` and requests
    ``https://mp.weixin.qq.com`` with them; the session token is taken from
    the URL of that response.  ``runCrawler`` then resolves every account
    name in ``wxList`` to its ``fakeid`` through ``cgi-bin/searchbiz`` and
    pages through ``cgi-bin/appmsg`` to print each article's link and title.
    """

    BASE_URL = 'https://mp.weixin.qq.com'
    SEARCH_URL = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    APPMSG_URL = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    COOKIE_FILE = 'cookie.txt'
    PAGE_SIZE = 5    # articles requested per appmsg call (the 'count' parameter)
    PAGE_DELAY = 3   # seconds to wait between two page requests
    TIMEOUT = 10     # seconds before an HTTP request is abandoned

    def __init__(self, wxList):
        """Log in with the stored cookies and prepare the query templates.

        Args:
            wxList: iterable of public-account names to crawl.

        Raises:
            OSError: ``cookie.txt`` cannot be read.
            RuntimeError: no token is found in the response URL.
        """
        self.wxList = wxList
        self.headers = {
            'HOST': 'mp.weixin.qq.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1 WOW64 rv:53.0) Gecko/20100101 Firefox/53.0'
        }
        self.cookies = self.__getCookiesFromText()
        self.token = self.__getToken()
        # The two dicts below are templates: every request works on a copy,
        # so one account's query/fakeid/begin never leaks into the next call.
        self.searchBizParam = {
            'action': 'search_biz',
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'query': '',
            'begin': '0',
            'count': str(self.PAGE_SIZE),
        }
        self.getMsgListParam = {
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'action': 'list_ex',
            'begin': '0',
            'count': str(self.PAGE_SIZE),
            'query': '',
            'fakeid': '',
            'type': '9'
        }

    @staticmethod
    def _parseCookieText(text):
        """Turn the content of the cookie file into a ``{name: value}`` dict.

        Two forms are accepted: a JSON object, or a raw ``Cookie`` header
        value such as ``a=1; b=2``.  Header pairs are split on the first
        ``=`` only, so values that themselves contain ``=`` stay intact.
        Fragments without ``=`` or without a name are ignored.
        """
        text = text.strip()
        if text.startswith('{'):
            return json.loads(text)
        cookies = {}
        for pair in text.split(';'):
            name, sep, value = pair.partition('=')
            name = name.strip()
            if sep and name:
                cookies[name] = value.strip()
        return cookies

    @staticmethod
    def _extractToken(url):
        """Return the digits following ``token=`` in *url*, or None if absent."""
        match = re.search(r'token=(\d+)', str(url))
        return match.group(1) if match else None

    @staticmethod
    def _pageOffsets(total, pageSize):
        """Return the ``begin`` offsets needed to cover *total* articles.

        A trailing partial page is included: 12 articles with a page size
        of 5 give ``[0, 5, 10]``.  A non-positive total gives ``[]``.
        """
        return list(range(0, max(total, 0), pageSize))

    def __getCookiesFromText(self):
        """Read the manually saved cookies from ``cookie.txt``."""
        with open(self.COOKIE_FILE, 'r', encoding='utf-8') as f:
            return self._parseCookieText(f.read())

    def __getToken(self):
        """Request the start page with the cookies and return the token.

        Raises:
            RuntimeError: the response URL carries no ``token=<digits>``.
        """
        response = requests.get(url=self.BASE_URL, cookies=self.cookies,
                                timeout=self.TIMEOUT)
        token = self._extractToken(response.url)
        if token is None:
            raise RuntimeError(
                'No token found in {!r}; the cookies in {} may be missing '
                'or expired'.format(str(response.url), self.COOKIE_FILE))
        return token

    def __getWXFakeid(self, wx):
        """Return the fakeid of the first search hit for *wx*, or None."""
        params = dict(self.searchBizParam, query=wx)
        response = requests.get(self.SEARCH_URL, cookies=self.cookies,
                                headers=self.headers, params=params,
                                timeout=self.TIMEOUT)
        accounts = response.json().get('list')
        if not accounts:
            # Nothing to index into; report it instead of failing on [0].
            print('No public account found for', wx)
            return None
        return accounts[0].get('fakeid')

    def __getWXMsgCnt(self, fakeId):
        """Return the ``app_msg_cnt`` value reported for *fakeId* (may be None)."""
        params = dict(self.getMsgListParam, fakeid=fakeId, begin='0')
        response = requests.get(self.APPMSG_URL, cookies=self.cookies,
                                headers=self.headers, params=params,
                                timeout=self.TIMEOUT)
        return response.json().get('app_msg_cnt')

    def __getWXMsgList(self, fakeId):
        """Print link and title of every article of the account *fakeId*.

        Does nothing when *fakeId* is None or no article count is reported.
        """
        if fakeId is None:
            return
        wxMsgCnt = self.__getWXMsgCnt(fakeId)
        if wxMsgCnt is None:
            return
        for begin in self._pageOffsets(int(wxMsgCnt), self.PAGE_SIZE):
            print('====page turning ====', begin)
            params = dict(self.getMsgListParam, fakeid=fakeId, begin=str(begin))
            response = requests.get(self.APPMSG_URL, cookies=self.cookies,
                                    headers=self.headers, params=params,
                                    timeout=self.TIMEOUT)
            msgList = response.json().get('app_msg_list')
            if msgList is None:
                # The reply has no article list, so later pages are not
                # requested for this account.
                print('No article list returned at offset', begin)
                break
            for item in msgList:
                # todo more
                print(item.get('link'))
                print(item.get('title'))
            time.sleep(self.PAGE_DELAY)

    def runCrawler(self):
        """Crawl every account in ``wxList``, one after the other."""
        for wx in self.wxList:
            self.__getWXMsgList(self.__getWXFakeid(wx))


if __name__ == '__main__':
    # example
    wxList = ['Qubit', ]
    wc = WeChatCrawler(wxList)
    wc.runCrawler()